Updated re2 to the latest version [#CLICKHOUSE-2]

This commit is contained in:
Alexey Milovidov 2018-01-19 04:18:47 +03:00
parent 9f57a1f7a5
commit 3e3d7b354a
67 changed files with 6520 additions and 9101 deletions

View File

@ -1,33 +1,30 @@
set (re2_sources
./re2/tostring.cc
./re2/dfa.cc
./re2/prefilter.cc
./re2/compile.cc
./re2/regexp.cc
./re2/onepass.cc
./re2/prefilter_tree.cc
./re2/set.cc
./re2/filtered_re2.cc
./re2/perl_groups.cc
./re2/parse.cc
./re2/nfa.cc
./re2/bitstate.cc
./re2/simplify.cc
./re2/unicode_groups.cc
./re2/compile.cc
./re2/dfa.cc
./re2/filtered_re2.cc
./re2/mimics_pcre.cc
./re2/re2.cc
./re2/nfa.cc
./re2/onepass.cc
./re2/parse.cc
./re2/perl_groups.cc
./re2/prefilter.cc
./re2/prefilter_tree.cc
./re2/prog.cc
./re2/re2.cc
./re2/regexp.cc
./re2/set.cc
./re2/simplify.cc
./re2/stringpiece.cc
./re2/tostring.cc
./re2/unicode_casefold.cc
./util/strutil.cc
./util/stringpiece.cc
./util/hash.cc
./util/arena.cc
./util/valgrind.cc
./re2/unicode_groups.cc
./util/benchmark.cc
./util/fuzz.cc
./util/pcre.cc
./util/stringprintf.cc
./util/rune.cc
./util/random.cc
./util/thread.cc
./util/strutil.cc
./util/test.cc
)
# Building re2 which is thread-safe and re2_st which is not.

View File

@ -1,9 +1 @@
Source: hg clone https://re2.googlecode.com/hg re2
Latest commit:
changeset: 118:1b483548272e
tag: tip
user: Russ Cox <rsc@swtch.com>
date: Mon Oct 06 15:08:47 2014 -0400
summary: doc: import clarifications from Go tree
https://github.com/google/re2/tree/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0

View File

@ -1,10 +0,0 @@
#!/bin/sh
# Rebuilds the re2_st/ directory of headers from the re2 source tree whose
# root is passed as the first argument ($1).
# Start from a clean output directory.
rm -rf re2_st
mkdir -p re2_st
# Only the headers named in this list are copied and rewritten.
for i in filtered_re2.h re2.h set.h stringpiece.h variadic_function.h;
do
cp $1/re2/$i re2_st/$i
# Edit the copy in place: remove `using re2::RE2;`, rename `namespace re2`
# and `re2::` qualifiers to re2_st, redirect "re2/..." include paths to
# "re2_st/...", and append _ST after text ending in _H (presumably the
# include-guard macros -- confirm against the headers).
sed -i -r 's/using re2::RE2;//g;s/namespace re2/namespace re2_st/g;s/re2::/re2_st::/g;s/\"re2\//\"re2_st\//g;s/(.*?_H)/\1_ST/g' re2_st/$i;
done

View File

@ -0,0 +1,113 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_BITMAP256_H_
#define RE2_BITMAP256_H_
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <stdint.h>
#include <string.h>
#include "util/util.h"
#include "util/logging.h"
namespace re2 {
// A set of 256 bits, indexed 0..255 and stored as four 64-bit words.
class Bitmap256 {
 public:
  // Creates a bitmap with every bit cleared.
  Bitmap256() {
    memset(words_, 0, sizeof words_);
  }

  // Reports whether the bit with index c is set.
  bool Test(int c) const {
    DCHECK_GE(c, 0);
    DCHECK_LE(c, 255);
    const uint64_t mask = 1ULL << (c % 64);
    return (words_[c / 64] & mask) != 0;
  }

  // Turns on the bit with index c.
  void Set(int c) {
    DCHECK_GE(c, 0);
    DCHECK_LE(c, 255);
    const uint64_t mask = 1ULL << (c % 64);
    words_[c / 64] |= mask;
  }

  // Returns the index of the first set bit at position c or later,
  // or -1 when no such bit exists.
  int FindNextSetBit(int c) const;

 private:
  // Returns the index of the lowest set bit of n; n must not be zero.
  static int FindLSBSet(uint64_t n) {
    DCHECK_NE(n, 0);
#if defined(__GNUC__)
    return __builtin_ctzll(n);
#elif defined(_MSC_VER) && defined(_M_X64)
    unsigned long pos;
    _BitScanForward64(&pos, n);
    return static_cast<int>(pos);
#elif defined(_MSC_VER) && defined(_M_IX86)
    // Only a 32-bit scan is used here: look at the low half first and
    // fall back to the high half when the low half is empty.
    unsigned long pos;
    const uint32_t low = static_cast<uint32_t>(n);
    if (low != 0) {
      _BitScanForward(&pos, low);
      return static_cast<int>(pos);
    }
    _BitScanForward(&pos, static_cast<uint32_t>(n >> 32));
    return static_cast<int>(pos) + 32;
#else
    // Portable fallback: repeatedly try to shift the lowest set bit towards
    // the top of the word; every shift that keeps the word non-zero lowers
    // the candidate index by the shift amount.
    int index = 63;
    for (int shift = 32; shift > 0; shift /= 2) {
      const uint64_t shifted = n << shift;
      if (shifted == 0)
        continue;
      n = shifted;
      index -= shift;
    }
    return index;
#endif
  }

  uint64_t words_[4];
};
// Returns the index of the lowest set bit whose index is >= c, or -1 if
// there is no such bit. c must lie in [0, 255].
//
// Marked `inline` because this out-of-class definition lives in a header:
// without it, every translation unit that includes the header would emit its
// own external definition and the program would fail to link (or violate the
// one-definition rule).
inline int Bitmap256::FindNextSetBit(int c) const {
  DCHECK_GE(c, 0);
  DCHECK_LE(c, 255);

  // Check the word that contains the bit. Mask out any lower bits.
  int i = c / 64;
  uint64_t word = words_[i] & (~0ULL << (c % 64));
  if (word != 0)
    return (i * 64) + FindLSBSet(word);

  // Check any following words. Each case deliberately falls through to the
  // next word; starting past the last word lands on `default`.
  i++;
  switch (i) {
    case 1:
      if (words_[1] != 0)
        return (1 * 64) + FindLSBSet(words_[1]);
      FALLTHROUGH_INTENDED;
    case 2:
      if (words_[2] != 0)
        return (2 * 64) + FindLSBSet(words_[2]);
      FALLTHROUGH_INTENDED;
    case 3:
      if (words_[3] != 0)
        return (3 * 64) + FindLSBSet(words_[3]);
      FALLTHROUGH_INTENDED;
    default:
      return -1;
  }
}
} // namespace re2
#endif // RE2_BITMAP256_H_

View File

@ -17,6 +17,11 @@
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
@ -60,8 +65,8 @@ class BitState {
int ncap_;
static const int VisitedBits = 32;
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
int nvisited_; // # of words in bitmap
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
size_t nvisited_; // # of words in bitmap
Job *job_; // stack of text positions to explore
int njob_;
@ -94,7 +99,7 @@ BitState::~BitState() {
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
uint n = id * (text_.size() + 1) + (p - text_.begin());
size_t n = id * (text_.size() + 1) + (p - text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
@ -103,7 +108,6 @@ bool BitState::ShouldVisit(int id, const char* p) {
// Grow the stack.
bool BitState::GrowStack() {
// VLOG(0) << "Reallocate.";
maxjob_ *= 2;
Job* newjob = new Job[maxjob_];
memmove(newjob, job_, njob_*sizeof job_[0]);
@ -141,6 +145,7 @@ void BitState::Push(int id, const char* p, int arg) {
// Return whether it succeeded.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
bool inaltmatch = false;
const char* end = text_.end();
njob_ = 0;
Push(id0, p0, 0);
@ -159,46 +164,37 @@ bool BitState::TrySearch(int id0, const char* p0) {
// would have, but we avoid the stack
// manipulation.
if (0) {
Next:
// If the Match of a non-greedy AltMatch failed,
// we stop ourselves from trying the ByteRange,
// which would steer us off the short circuit.
if (prog_->inst(id)->last() || inaltmatch)
continue;
id++;
CheckAndLoop:
if (!ShouldVisit(id, p))
continue;
}
// Visit ip, p.
// VLOG(0) << "Job: " << ip->id() << " "
// << (p - text_.begin()) << " " << arg;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
case kInstFail:
default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
return false;
case kInstAlt:
// Cannot just
// Push(ip->out1(), p, 0);
// Push(ip->out(), p, 0);
// If, during the processing of ip->out(), we encounter
// ip->out1() via another path, we want to process it then.
// Pushing it here will inhibit that. Instead, re-push
// ip with arg==1 as a reminder to push ip->out1() later.
switch (arg) {
case 0:
Push(id, p, 1); // come back when we're done
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); try ip->out1().
arg = 0;
id = ip->out1();
goto CheckAndLoop;
}
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
case kInstFail:
continue;
case kInstAltMatch:
// One opcode is byte range; the other leads to match.
switch (arg) {
case 0:
inaltmatch = true;
Push(id, p, 1); // come back when we're done
// One opcode is ByteRange; the other leads to Match
// (possibly via Nop or Capture).
if (ip->greedy(prog_)) {
// out1 is the match
Push(ip->out1(), p, 0);
@ -211,29 +207,43 @@ bool BitState::TrySearch(int id0, const char* p0) {
id = ip->out();
goto CheckAndLoop;
case 1:
inaltmatch = false;
continue;
}
LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
continue;
case kInstByteRange: {
int c = -1;
if (p < end)
c = *p & 0xFF;
if (ip->Matches(c)) {
if (!ip->Matches(c))
goto Next;
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
id = ip->out();
p++;
goto CheckAndLoop;
}
continue;
}
case kInstCapture:
switch (arg) {
case 0:
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
if (0 <= ip->cap() && ip->cap() < ncap_) {
// Capture p to register, but save old value.
Push(id, cap_[ip->cap()], 1); // come back when we're done
cap_[ip->cap()] = p;
}
// Continue on.
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); restore the old value.
cap_[ip->cap()] = p;
@ -244,19 +254,23 @@ bool BitState::TrySearch(int id0, const char* p0) {
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
continue;
goto Next;
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstNop:
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstMatch: {
if (endmatch_ && p != text_.end())
continue;
goto Next;
// VLOG(0) << "Found match.";
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if (nsubmatch_ == 0)
@ -270,7 +284,9 @@ bool BitState::TrySearch(int id0, const char* p0) {
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].end())) {
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
submatch_[i] =
StringPiece(cap_[2 * i],
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
// If going for first match, we're done.
@ -282,7 +298,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
return true;
// Otherwise, continue on in hope of a longer match.
continue;
goto Next;
}
}
}
@ -308,13 +324,12 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
submatch_ = submatch;
nsubmatch_ = nsubmatch;
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = NULL;
submatch_[i] = StringPiece();
// Allocate scratch space.
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
visited_ = new uint32[nvisited_];
visited_ = new uint32_t[nvisited_];
memset(visited_, 0, nvisited_*sizeof visited_[0]);
// VLOG(0) << "nvisited_ = " << nvisited_;
ncap_ = 2*nsubmatch;
if (ncap_ < 2)
@ -338,6 +353,14 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
for (const char* p = text.begin(); p <= text.end(); p++) {
// Try to use memchr to find the first byte quickly.
int fb = prog_->first_byte();
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL)
p = text.end();
}
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;

View File

@ -8,6 +8,13 @@
// This file's external interface is just Regexp::CompileToProg.
// The Compiler class defined in this file is private.
#include <stdint.h>
#include <string.h>
#include <unordered_map>
#include <utility>
#include "util/logging.h"
#include "util/utf.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
@ -28,14 +35,14 @@ namespace re2 {
// is always the fail instruction, which never appears on a list.
struct PatchList {
uint32 p;
uint32_t p;
// Returns patch list containing just p.
static PatchList Mk(uint32 p);
static PatchList Mk(uint32_t p);
// Patches all the entries on l to have value v.
// Caller must not ever use patch list again.
static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
// Deref returns the next pointer pointed at by p.
static PatchList Deref(Prog::Inst *inst0, PatchList l);
@ -47,7 +54,7 @@ struct PatchList {
static PatchList nullPatchList = { 0 };
// Returns patch list containing just p.
PatchList PatchList::Mk(uint32 p) {
PatchList PatchList::Mk(uint32_t p) {
PatchList l;
l.p = p;
return l;
@ -64,7 +71,7 @@ PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
}
// Patches all the entries on l to have value v.
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
while (l.p != 0) {
Prog::Inst* ip = &inst0[l.p>>1];
if (l.p&1) {
@ -103,17 +110,17 @@ PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
// Compiled program fragment.
struct Frag {
uint32 begin;
uint32_t begin;
PatchList end;
Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
};
// Input encodings.
enum Encoding {
kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
kEncodingLatin1, // Latin1 (0-FF)
kEncodingLatin1, // Latin-1 (0-FF)
};
class Compiler : public Regexp::Walker<Frag> {
@ -125,12 +132,11 @@ class Compiler : public Regexp::Walker<Frag> {
// Caller is responsible for deleting Prog when finished with it.
// If reversed is true, compiles for walking over the input
// string backward (reverses all concatenations).
static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
// Compiles alternation of all the re to a new Prog.
// Each re has a match with an id equal to its index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Interface for Regexp::Walker, which helps traverse the Regexp.
// The walk is purely post-recursive: given the machines for the
@ -162,7 +168,7 @@ class Compiler : public Regexp::Walker<Frag> {
Frag NoMatch();
// Returns a fragment that matches the empty string.
Frag Match(int32 id);
Frag Match(int32_t id);
// Returns a no-op fragment.
Frag Nop();
@ -178,9 +184,6 @@ class Compiler : public Regexp::Walker<Frag> {
// Returns -1 if no more instructions are available.
int AllocInst(int n);
// Deletes unused instructions.
void Trim();
// Rune range compiler.
// Begins a new alternation.
@ -193,19 +196,35 @@ class Compiler : public Regexp::Walker<Frag> {
void Add_80_10ffff();
// New suffix that matches the byte range lo-hi, then goes to next.
int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
// Returns true iff the suffix is cached.
bool IsCachedRuneByteSuffix(int id);
// Adds a suffix to alternation.
void AddSuffix(int id);
// Adds a suffix to the trie starting from the given root node.
// Returns zero iff allocating an instruction fails. Otherwise, returns
// the current root node, which might be different from what was given.
int AddSuffixRecursive(int root, int id);
// Finds the trie node for the given suffix. Returns a Frag in order to
// distinguish between pointing at the root node directly (end.p == 0)
// and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
Frag FindByteRange(int root, int id);
// Compares two ByteRanges and returns true iff they are equal.
bool ByteRangeEqual(int id1, int id2);
// Returns the alternation of all the added suffixes.
Frag EndRange();
// Single rune.
Frag Literal(Rune r, bool foldcase);
void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
Prog* Finish();
// Returns .* where dot = any byte
@ -223,14 +242,15 @@ class Compiler : public Regexp::Walker<Frag> {
int inst_len_; // Number of instructions used.
int inst_cap_; // Number of instructions allocated.
int64 max_mem_; // Total memory budget.
int64_t max_mem_; // Total memory budget.
map<uint64, int> rune_cache_;
std::unordered_map<uint64_t, int> rune_cache_;
Frag rune_range_;
RE2::Anchor anchor_; // anchor mode for RE2::Set
DISALLOW_EVIL_CONSTRUCTORS(Compiler);
Compiler(const Compiler&) = delete;
Compiler& operator=(const Compiler&) = delete;
};
Compiler::Compiler() {
@ -265,6 +285,7 @@ int Compiler::AllocInst(int n) {
while (inst_len_ + n > inst_cap_)
inst_cap_ *= 2;
Prog::Inst* ip = new Prog::Inst[inst_cap_];
if (inst_ != NULL)
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
delete[] inst_;
@ -275,16 +296,6 @@ int Compiler::AllocInst(int n) {
return id;
}
void Compiler::Trim() {
if (inst_len_ < inst_cap_) {
Prog::Inst* ip = new Prog::Inst[inst_len_];
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
delete[] inst_;
inst_ = ip;
inst_cap_ = inst_len_;
}
}
// These routines are somewhat hard to visualize in text --
// see http://swtch.com/~rsc/regexp/regexp1.html for
// pictures explaining what is going on here.
@ -393,16 +404,6 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
if (id < 0)
return NoMatch();
inst_[id].InitByteRange(lo, hi, foldcase, 0);
prog_->byte_inst_count_++;
prog_->MarkByteRange(lo, hi);
if (foldcase && lo <= 'z' && hi >= 'a') {
if (lo < 'a')
lo = 'a';
if (hi > 'z')
hi = 'z';
if (lo <= hi)
prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
}
return Frag(id, PatchList::Mk(id << 1));
}
@ -416,7 +417,7 @@ Frag Compiler::Nop() {
}
// Returns a fragment that signals a match.
Frag Compiler::Match(int32 match_id) {
Frag Compiler::Match(int32_t match_id) {
int id = AllocInst(1);
if (id < 0)
return NoMatch();
@ -430,16 +431,6 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
if (id < 0)
return NoMatch();
inst_[id].InitEmptyWidth(empty, 0);
if (empty & (kEmptyBeginLine|kEmptyEndLine))
prog_->MarkByteRange('\n', '\n');
if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
int j;
for (int i = 0; i < 256; i = j) {
for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
;
prog_->MarkByteRange(i, j-1);
}
}
return Frag(id, PatchList::Mk(id << 1));
}
@ -482,7 +473,7 @@ void Compiler::BeginRange() {
rune_range_.end = nullPatchList;
}
int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
int next) {
Frag f = ByteRange(lo, hi, foldcase);
if (next != 0) {
@ -493,18 +484,18 @@ int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
return f.begin;
}
int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
// In Latin1 mode, there's no point in caching.
// In forward UTF-8 mode, only need to cache continuation bytes.
if (encoding_ == kEncodingLatin1 ||
(encoding_ == kEncodingUTF8 &&
!reversed_ &&
!(0x80 <= lo && hi <= 0xbf))) {
return UncachedRuneByteSuffix(lo, hi, foldcase, next);
// Packs the description of a byte-range suffix into a single 64-bit key:
// foldcase occupies bit 0, hi starts at bit 1, lo starts at bit 9 and
// next starts at bit 17.
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  const uint64_t next_bits = static_cast<uint64_t>(next) << 17;
  const uint64_t lo_bits = static_cast<uint64_t>(lo) << 9;
  const uint64_t hi_bits = static_cast<uint64_t>(hi) << 1;
  const uint64_t fold_bit = static_cast<uint64_t>(foldcase);
  return next_bits | lo_bits | hi_bits | fold_bit;
}
uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
map<uint64, int>::iterator it = rune_cache_.find(key);
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
int next) {
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
if (it != rune_cache_.end())
return it->second;
int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
@ -512,12 +503,31 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
return id;
}
bool Compiler::IsCachedRuneByteSuffix(int id) {
uint8_t lo = inst_[id].lo_;
uint8_t hi = inst_[id].hi_;
bool foldcase = inst_[id].foldcase() != 0;
int next = inst_[id].out();
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
return rune_cache_.find(key) != rune_cache_.end();
}
void Compiler::AddSuffix(int id) {
if (failed_)
return;
if (rune_range_.begin == 0) {
rune_range_.begin = id;
return;
}
if (encoding_ == kEncodingUTF8) {
// Build a trie in order to reduce fanout.
rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
return;
}
int alt = AllocInst(1);
if (alt < 0) {
rune_range_.begin = 0;
@ -527,6 +537,102 @@ void Compiler::AddSuffix(int id) {
rune_range_.begin = alt;
}
// Merges the suffix starting at instruction `id` into the trie rooted at
// `root`. Returns 0 when an instruction could not be allocated; otherwise
// returns the root to use from now on, which may differ from the one passed
// in (a new Alt, or a clone of a cached root).
int Compiler::AddSuffixRecursive(int root, int id) {
DCHECK(inst_[root].opcode() == kInstAlt ||
inst_[root].opcode() == kInstByteRange);
// Look for a node in the trie whose byte range equals that of `id`.
Frag f = FindByteRange(root, id);
if (IsNoMatch(f)) {
// Nothing to share: join the existing trie and the new suffix with an Alt.
int alt = AllocInst(1);
if (alt < 0)
return 0;
inst_[alt].InitAlt(root, id);
return alt;
}
// Decode where the matching ByteRange lives: it is the root itself
// (end.p == 0), or it hangs off the Alt at f.begin via out1 (low bit set)
// or via out (low bit clear).
int br;
if (f.end.p == 0)
br = root;
else if (f.end.p&1)
br = inst_[f.begin].out1();
else
br = inst_[f.begin].out();
if (IsCachedRuneByteSuffix(br)) {
// We can't fiddle with cached suffixes, so make a clone of the head.
int byterange = AllocInst(1);
if (byterange < 0)
return 0;
inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
inst_[br].foldcase(), inst_[br].out());
// Ensure that the parent points to the clone, not to the original.
// Note that this could leave the head unreachable except via the cache.
br = byterange;
if (f.end.p == 0)
root = br;
else if (f.end.p&1)
inst_[f.begin].out1_ = br;
else
inst_[f.begin].set_out(br);
}
// Remember the tail of the new suffix before its head is possibly released.
int out = inst_[id].out();
if (!IsCachedRuneByteSuffix(id)) {
// The head should be the instruction most recently allocated, so free it
// instead of leaving it unreachable.
DCHECK_EQ(id, inst_len_-1);
inst_[id].out_opcode_ = 0;
inst_[id].out1_ = 0;
inst_len_--;
}
// The heads matched, so merge the tail of the new suffix into whatever
// follows the shared ByteRange, and hook the result back in.
out = AddSuffixRecursive(inst_[br].out(), out);
if (out == 0)
return 0;
inst_[br].set_out(out);
return root;
}
bool Compiler::ByteRangeEqual(int id1, int id2) {
return inst_[id1].lo() == inst_[id2].lo() &&
inst_[id1].hi() == inst_[id2].hi() &&
inst_[id1].foldcase() == inst_[id2].foldcase();
}
// Searches the trie rooted at `root` for a ByteRange equal to that of `id`.
// The returned Frag encodes where the match was found: end.p == 0 means the
// root itself matched; otherwise begin is an Alt and the low bit of end.p
// says whether the match is its out1 (1) or its out (0). Returns NoMatch()
// when there is no equal ByteRange.
Frag Compiler::FindByteRange(int root, int id) {
  if (inst_[root].opcode() == kInstByteRange)
    return ByteRangeEqual(root, id) ? Frag(root, nullPatchList) : NoMatch();

  for (int node = root; inst_[node].opcode() == kInstAlt; ) {
    if (ByteRangeEqual(inst_[node].out1(), id))
      return Frag(node, PatchList::Mk((node << 1) | 1));

    // CharClass is a sorted list of ranges, so in forward mode a miss on
    // out1 of the first Alt ends the search. Reverse mode has no such
    // shortcut and has to keep walking the chain of Alts.
    if (!reversed_)
      return NoMatch();

    const int next = inst_[node].out();
    if (inst_[next].opcode() != kInstAlt) {
      if (ByteRangeEqual(next, id))
        return Frag(node, PatchList::Mk(node << 1));
      return NoMatch();
    }
    node = next;
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}
// Finishes the current rune range and returns the fragment accumulated in
// rune_range_ (the alternation of all suffixes added so far).
Frag Compiler::EndRange() {
return rune_range_;
}
@ -550,12 +656,13 @@ void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
}
void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
// Latin1 is easy: runes *are* bytes.
// Latin-1 is easy: runes *are* bytes.
if (lo > hi || lo > 0xFF)
return;
if (hi > 0xFF)
hi = 0xFF;
AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
static_cast<uint8_t>(hi), foldcase, 0));
}
// Table describing how to make a UTF-8 matching machine
@ -591,12 +698,13 @@ static struct ByteRangeProg {
void Compiler::Add_80_10ffff() {
int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
const ByteRangeProg& p = prog_80_10ffff[i];
int next = 0;
if (p.next >= 0)
next = inst[p.next];
inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
static_cast<uint8_t>(p.hi), false, next);
if ((p.lo & 0xC0) != 0x80)
AddSuffix(inst[i]);
}
@ -625,13 +733,14 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
// ASCII range is always a special case.
if (hi < Runeself) {
AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
static_cast<uint8_t>(hi), foldcase, 0));
return;
}
// Split range into sections that agree on leading bytes.
for (int i = 1; i < UTFmax; i++) {
uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
if ((lo & ~m) != (hi & ~m)) {
if ((lo & m) != 0) {
AddRuneRangeUTF8(lo, lo|m, foldcase);
@ -647,19 +756,55 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
}
// Finally. Generate byte matching equivalent for lo-hi.
uint8 ulo[UTFmax], uhi[UTFmax];
uint8_t ulo[UTFmax], uhi[UTFmax];
int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
(void)m; // USED(m)
DCHECK_EQ(n, m);
// The logic below encodes this thinking:
//
// 1. When we have built the whole suffix, we know that it cannot
// possibly be a suffix of anything longer: in forward mode, nothing
// else can occur before the leading byte; in reverse mode, nothing
// else can occur after the last continuation byte or else the leading
// byte would have to change. Thus, there is no benefit to caching
// the first byte of the suffix whereas there is a cost involved in
// cloning it if it begins a common prefix, which is fairly likely.
//
// 2. Conversely, the last byte of the suffix cannot possibly be a
// prefix of anything because next == 0, so we will never want to
// clone it, but it is fairly likely to be a common suffix. Perhaps
// more so in reverse mode than in forward mode because the former is
// "converging" towards lower entropy, but caching is still worthwhile
// for the latter in cases such as 80-BF.
//
// 3. Handling the bytes between the first and the last is less
// straightforward and, again, the approach depends on whether we are
// "converging" towards lower entropy: in forward mode, a single byte
// is unlikely to be part of a common suffix whereas a byte range
// is more likely so; in reverse mode, a byte range is unlikely to
// be part of a common suffix whereas a single byte is more likely
// so. The same benefit versus cost argument applies here.
int id = 0;
if (reversed_) {
for (int i = 0; i < n; i++)
id = RuneByteSuffix(ulo[i], uhi[i], false, id);
for (int i = 0; i < n; i++) {
// In reverse UTF-8 mode: cache the leading byte; don't cache the last
// continuation byte; cache anything else iff it's a single byte (XX-XX).
if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
else
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
}
} else {
for (int i = n-1; i >= 0; i--)
id = RuneByteSuffix(ulo[i], uhi[i], false, id);
for (int i = n-1; i >= 0; i--) {
// In forward UTF-8 mode: don't cache the leading byte; cache the last
// continuation byte; cache anything else iff it's a byte range (XX-YY).
if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
else
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
}
}
AddSuffix(id);
}
@ -699,11 +844,11 @@ Frag Compiler::Literal(Rune r, bool foldcase) {
case kEncodingUTF8: {
if (r < Runeself) // Make common case fast.
return ByteRange(r, r, foldcase);
uint8 buf[UTFmax];
uint8_t buf[UTFmax];
int n = runetochar(reinterpret_cast<char*>(buf), &r);
Frag f = ByteRange((uint8)buf[0], buf[0], false);
Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
for (int i = 1; i < n; i++)
f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
return f;
}
}
@ -732,9 +877,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
case kRegexpHaveMatch: {
Frag f = Match(re->match_id());
// Remember unanchored match to end of string.
if (anchor_ != RE2::ANCHOR_BOTH)
f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
if (anchor_ == RE2::ANCHOR_BOTH) {
// Append \z or else the subexpression will effectively be unanchored.
// Complemented by the UNANCHORED case in CompileSet().
f = Cat(EmptyWidth(kEmptyEndText), f);
}
return f;
}
@ -753,16 +900,16 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
}
case kRegexpStar:
return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
case kRegexpPlus:
return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
case kRegexpQuest:
return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
case kRegexpLiteral:
return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
case kRegexpLiteralString: {
// Concatenation of literals.
@ -770,7 +917,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
return Nop();
Frag f;
for (int i = 0; i < re->nrunes(); i++) {
Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
Frag f1 = Literal(re->runes()[i],
(re->parse_flags()&Regexp::FoldCase) != 0);
if (i == 0)
f = f1;
else
@ -815,7 +963,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
// If this range contains all of A-Za-z or none of it,
// the fold flag is unnecessary; don't bother.
bool fold = foldascii;
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
('Z' < i->lo && i->hi < 'a'))
fold = false;
AddRuneRange(i->lo, i->hi, fold);
@ -949,7 +1098,7 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
return false;
}
void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
RE2::Anchor anchor) {
prog_->set_flags(flags);
@ -958,11 +1107,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
max_mem_ = max_mem;
if (max_mem <= 0) {
max_inst_ = 100000; // more than enough
} else if (max_mem <= static_cast<int64>(sizeof(Prog))) {
} else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
// No room for anything.
max_inst_ = 0;
} else {
int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
// Limit instruction count so that inst->id() fits nicely in an int.
// SparseArray also assumes that the indices (inst->id()) are ints.
// The call to WalkExponential uses 2*max_inst_ below,
@ -978,7 +1127,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
if (m > Prog::Inst::kMaxInst)
m = Prog::Inst::kMaxInst;
max_inst_ = m;
max_inst_ = static_cast<int>(m);
}
anchor_ = anchor;
@ -989,10 +1138,9 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
// If reversed is true, compiles a program that expects
// to run over the input string backward (reverses all concatenations).
// The reversed flag is also recorded in the returned program.
Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
Compiler c;
c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
c.reversed_ = reversed;
// Simplify to remove things like counted repetitions
@ -1007,7 +1155,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
bool is_anchor_end = IsAnchorEnd(&sre, 0);
// Generate fragment for entire regexp.
Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
sre->Decref();
if (c.failed_)
return NULL;
@ -1016,10 +1164,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
// Turn off c.reversed_ (if it is set) to force the remaining concatenations
// to behave normally.
c.reversed_ = false;
Frag all = c.Cat(f, c.Match(0));
c.prog_->set_start(all.begin);
all = c.Cat(all, c.Match(0));
if (reversed) {
c.prog_->set_reversed(reversed);
if (c.prog_->reversed()) {
c.prog_->set_anchor_start(is_anchor_end);
c.prog_->set_anchor_end(is_anchor_start);
} else {
@ -1027,15 +1175,12 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
c.prog_->set_anchor_end(is_anchor_end);
}
c.prog_->set_start(all.begin);
if (!c.prog_->anchor_start()) {
// Also create unanchored version, which starts with a .*? loop.
if (c.prog_->anchor_start()) {
c.prog_->set_start_unanchored(c.prog_->start());
} else {
Frag unanchored = c.Cat(c.DotStar(), all);
c.prog_->set_start_unanchored(unanchored.begin);
all = c.Cat(c.DotStar(), all);
}
c.prog_->set_reversed(reversed);
c.prog_->set_start_unanchored(all.begin);
// Hand ownership of prog_ to caller.
return c.Finish();
@ -1050,22 +1195,20 @@ Prog* Compiler::Finish() {
inst_len_ = 1;
}
// Trim instruction to minimum array and transfer to Prog.
Trim();
// Hand off the array to Prog.
prog_->inst_ = inst_;
prog_->size_ = inst_len_;
inst_ = NULL;
// Compute byte map.
prog_->ComputeByteMap();
prog_->Optimize();
prog_->Flatten();
prog_->ComputeByteMap();
// Record remaining memory for DFA.
if (max_mem_ <= 0) {
prog_->set_dfa_mem(1<<20);
} else {
int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst);
if (m < 0)
m = 0;
prog_->set_dfa_mem(m);
@ -1077,11 +1220,11 @@ Prog* Compiler::Finish() {
}
// Converts Regexp to Prog.
// Produces the forward-running program: forwards to Compiler::Compile
// with reversed=false, passing max_mem through unchanged.
Prog* Regexp::CompileToProg(int64 max_mem) {
Prog* Regexp::CompileToProg(int64_t max_mem) {
return Compiler::Compile(this, false, max_mem);
}
// Converts Regexp to a Prog meant to run over the input backward:
// forwards to Compiler::Compile with reversed=true, passing max_mem
// through unchanged.
Prog* Regexp::CompileToReverseProg(int64 max_mem) {
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
return Compiler::Compile(this, true, max_mem);
}
@ -1090,41 +1233,41 @@ Frag Compiler::DotStar() {
}
// Compiles RE set to Prog.
Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re) {
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
Compiler c;
c.Setup(re->parse_flags(), max_mem, anchor);
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
c.Setup(pf, options.max_mem(), anchor);
Regexp* sre = re->Simplify();
if (sre == NULL)
return NULL;
// Compile alternation of fragments.
Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_);
re->Decref();
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
sre->Decref();
if (c.failed_)
return NULL;
if (anchor == RE2::UNANCHORED) {
// The trailing .* was added while handling kRegexpHaveMatch.
// We just have to add the leading one.
all = c.Cat(c.DotStar(), all);
}
c.prog_->set_start(all.begin);
c.prog_->set_start_unanchored(all.begin);
c.prog_->set_anchor_start(true);
c.prog_->set_anchor_end(true);
if (anchor == RE2::UNANCHORED) {
// Prepend .* or else the expression will effectively be anchored.
// Complemented by the ANCHOR_BOTH case in PostVisit().
all = c.Cat(c.DotStar(), all);
}
c.prog_->set_start(all.begin);
c.prog_->set_start_unanchored(all.begin);
Prog* prog = c.Finish();
if (prog == NULL)
return NULL;
// Make sure DFA has enough memory to operate,
// since we're not going to fall back to the NFA.
bool failed;
bool dfa_failed = false;
StringPiece sp = "hello, world";
prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
NULL, &failed, NULL);
if (failed) {
NULL, &dfa_failed, NULL);
if (dfa_failed) {
delete prog;
return NULL;
}
@ -1132,9 +1275,8 @@ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
return prog;
}
// Prog-level entry point for compiling an RE set: a thin wrapper that
// returns whatever Compiler::CompileSet returns for the same arguments.
Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re) {
return Compiler::CompileSet(options, anchor, re);
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
return Compiler::CompileSet(re, anchor, max_mem);
}
} // namespace re2

File diff suppressed because it is too large Load Diff

View File

@ -2,9 +2,13 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <string>
#include "util/util.h"
#include "re2/filtered_re2.h"
#include <stddef.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
@ -15,6 +19,11 @@ FilteredRE2::FilteredRE2()
prefilter_tree_(new PrefilterTree()) {
}
// Creates an empty FilteredRE2 that has not been compiled yet.
// min_atom_len is passed straight through to the PrefilterTree that this
// constructor allocates.
FilteredRE2::FilteredRE2(int min_atom_len)
    : compiled_(false), prefilter_tree_(new PrefilterTree(min_atom_len)) {}
FilteredRE2::~FilteredRE2() {
for (size_t i = 0; i < re2_vec_.size(); i++)
delete re2_vec_[i];
@ -33,16 +42,21 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
}
delete re;
} else {
*id = re2_vec_.size();
*id = static_cast<int>(re2_vec_.size());
re2_vec_.push_back(re);
}
return code;
}
void FilteredRE2::Compile(vector<string>* atoms) {
if (compiled_ || re2_vec_.size() == 0) {
LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
void FilteredRE2::Compile(std::vector<string>* atoms) {
if (compiled_) {
LOG(ERROR) << "Compile called already.";
return;
}
if (re2_vec_.empty()) {
LOG(ERROR) << "Compile called before Add.";
return;
}
@ -58,17 +72,17 @@ void FilteredRE2::Compile(vector<string>* atoms) {
// Tries RE2::PartialMatch on text against every regexp in re2_vec_, in
// index order, and returns the index of the first one that matches.
// Returns -1 when no regexp matches. Neither compiled_ nor the prefilter
// tree is consulted here.
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return i;
return static_cast<int>(i);
return -1;
}
int FilteredRE2::FirstMatch(const StringPiece& text,
const vector<int>& atoms) const {
const std::vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile";
LOG(DFATAL) << "FirstMatch called before Compile.";
return -1;
}
vector<int> regexps;
std::vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
@ -78,10 +92,10 @@ int FilteredRE2::FirstMatch(const StringPiece& text,
bool FilteredRE2::AllMatches(
const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const {
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const {
matching_regexps->clear();
vector<int> regexps;
std::vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
@ -89,11 +103,16 @@ bool FilteredRE2::AllMatches(
return !matching_regexps->empty();
}
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
void FilteredRE2::AllPotentials(
const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const {
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
}
// Testing/debugging hook: forwards matched_atoms and passed_regexps
// unchanged to PrefilterTree::RegexpsGivenStrings.
void FilteredRE2::RegexpsGivenStrings(
    const std::vector<int>& matched_atoms,
    std::vector<int>* passed_regexps) {
  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
@ -18,20 +21,19 @@
// indices of strings that were found in the text to get the actual
// regexp matches.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
#include <string>
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
class PrefilterTree;
class FilteredRE2 {
public:
FilteredRE2();
explicit FilteredRE2(int min_atom_len);
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
@ -47,7 +49,7 @@ class FilteredRE2 {
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(vector<string>* strings_to_match);
void Compile(std::vector<string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
@ -59,16 +61,24 @@ class FilteredRE2 {
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const vector<int>& atoms) const;
const std::vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matching_regexps.
bool AllMatches(const StringPiece& text,
const vector<int>& atoms,
vector<int>* matching_regexps) const;
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
// If a regexp passes the filter it may still not match.
// A regexp that does not pass the filter is guaranteed to not match.
void AllPotentials(const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const;
// The number of regexps added.
int NumRegexps() const { return re2_vec_.size(); }
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
private:
@ -79,11 +89,11 @@ class FilteredRE2 {
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* passed_regexps);
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
vector<RE2*> re2_vec_;
std::vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
@ -91,9 +101,8 @@ class FilteredRE2 {
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
FilteredRE2(const FilteredRE2&);
void operator=(const FilteredRE2&);
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
};
} // namespace re2

View File

@ -23,6 +23,7 @@
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
@ -124,7 +125,8 @@ class EmptyStringWalker : public Regexp::Walker<bool> {
}
private:
DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
EmptyStringWalker(const EmptyStringWalker&) = delete;
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
};
// Called after visiting re's children. child_args contains the return

View File

@ -24,13 +24,24 @@
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/logging.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
namespace re2 {
static const bool ExtraDebug = false;
class NFA {
public:
NFA(Prog* prog);
@ -51,12 +62,10 @@ class NFA {
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
static const int Debug = 0;
private:
struct Thread {
union {
int id;
int ref;
Thread* next; // when on free list
};
const char** capture;
@ -65,15 +74,14 @@ class NFA {
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
int j;
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
Thread* t; // if not null, set t0 = t before processing id
AddState()
: id(0), j(-1), cap_j(NULL) {}
: id(0), t(NULL) {}
explicit AddState(int id)
: id(id), j(-1), cap_j(NULL) {}
AddState(int id, const char* cap_j, int j)
: id(id), j(j), cap_j(cap_j) {}
: id(id), t(NULL) {}
AddState(int id, Thread* t)
: id(id), t(t) {}
};
// Threadq is a list of threads. The list is sorted by the order
@ -82,19 +90,24 @@ class NFA {
typedef SparseArray<Thread*> Threadq;
inline Thread* AllocThread();
inline void FreeThread(Thread*);
inline Thread* Incref(Thread* t);
inline void Decref(Thread* t);
// Add id (or its children, following unlabeled arrows)
// to the workqueue q with associated capture info.
void AddToThreadq(Threadq* q, int id, int flag,
const char* p, const char** capture);
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// p is the current input position, and t0 is the current thread.
void AddToThreadq(Threadq* q, int id0, int c, int flag,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// p is position of the next byte (the one after c)
// in the input string, used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Returns text version of capture information, for debugging.
@ -102,10 +115,6 @@ class NFA {
inline void CopyCapture(const char** dst, const char** src);
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
@ -118,16 +127,16 @@ class NFA {
bool matched_; // any match so far?
AddState* astack_; // pre-allocated for AddToThreadq
int nastack_;
int first_byte_; // required first byte for match, or -1 if none
Thread* free_threads_; // free list
DISALLOW_EVIL_CONSTRUCTORS(NFA);
NFA(const NFA&) = delete;
NFA& operator=(const NFA&) = delete;
};
NFA::NFA(Prog* prog) {
prog_ = prog;
start_ = prog->start();
start_ = prog_->start();
ncapture_ = 0;
longest_ = false;
endmatch_ = false;
@ -135,12 +144,14 @@ NFA::NFA(Prog* prog) {
etext_ = NULL;
q0_.resize(prog_->size());
q1_.resize(prog_->size());
nastack_ = 2*prog_->size();
// See NFA::AddToThreadq() for why this is so.
nastack_ = 2*prog_->inst_count(kInstCapture) +
prog_->inst_count(kInstEmptyWidth) +
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
astack_ = new AddState[nastack_];
match_ = NULL;
matched_ = false;
free_threads_ = NULL;
first_byte_ = ComputeFirstByte();
}
NFA::~NFA() {
@ -154,24 +165,36 @@ NFA::~NFA() {
}
}
// Pushes t onto the head of the free list so AllocThread can reuse it.
// A null t is ignored.
void NFA::FreeThread(Thread *t) {
  if (t != NULL) {
    t->next = free_threads_;
    free_threads_ = t;
  }
}
// Returns a Thread with a reference count of 1. Reuses the head of the
// free list when one is available; otherwise allocates a new Thread along
// with a capture array of ncapture_ entries.
NFA::Thread* NFA::AllocThread() {
  Thread* t = free_threads_;
  if (t != NULL) {
    // Unlink before touching ref: ref and next share storage in Thread.
    free_threads_ = t->next;
  } else {
    t = new Thread;
    t->capture = new const char*[ncapture_];
  }
  t->ref = 1;
  return t;
}
// Adds one reference to t and hands t back, so the call can be used
// inline where the new reference is stored. t must not be null.
NFA::Thread* NFA::Incref(Thread* t) {
  DCHECK(t != NULL);
  ++t->ref;
  return t;
}
// Drops one reference to t. When the count reaches zero the thread is
// pushed onto the free list for reuse. A null t is ignored.
void NFA::Decref(Thread* t) {
  if (t != NULL && --t->ref <= 0) {
    DCHECK_EQ(t->ref, 0);
    // Linking through next overwrites ref; the two share storage.
    t->next = free_threads_;
    free_threads_ = t;
  }
}
void NFA::CopyCapture(const char** dst, const char** src) {
for (int i = 0; i < ncapture_; i+=2) {
dst[i] = src[i];
@ -180,35 +203,43 @@ void NFA::CopyCapture(const char** dst, const char** src) {
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// The pointer p is the current input position, and m is the
// current set of match boundaries.
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
const char* p, const char** capture) {
// p is the current input position, and t0 is the current thread.
void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
const char* p, Thread* t0) {
if (id0 == 0)
return;
// Astack_ is pre-allocated to avoid resize operations.
// It has room for 2*prog_->size() entries, which is enough:
// Each inst in prog can be processed at most once,
// pushing at most two entries on stk.
int nstk = 0;
// Use astack_ to hold our stack of instructions yet to process.
// It was preallocated as follows:
// two entries per Capture;
// one entry per EmptyWidth; and
// one entry per Nop.
// This reflects the maximum number of stack pushes that each can
// perform. (Each instruction can be processed at most once.)
AddState* stk = astack_;
stk[nstk++] = AddState(id0);
int nstk = 0;
stk[nstk++] = AddState(id0);
while (nstk > 0) {
DCHECK_LE(nstk, nastack_);
const AddState& a = stk[--nstk];
if (a.j >= 0)
capture[a.j] = a.cap_j;
AddState a = stk[--nstk];
Loop:
if (a.t != NULL) {
// t0 was a thread that we allocated and copied in order to
// record the capture, so we must now decref it.
Decref(t0);
t0 = a.t;
}
int id = a.id;
if (id == 0)
continue;
if (q->has_index(id)) {
if (Debug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
if (ExtraDebug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
continue;
}
@ -231,62 +262,76 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
case kInstAltMatch:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
t = Incref(t0);
*tp = t;
// fall through
case kInstAlt:
// Explore alternatives.
stk[nstk++] = AddState(ip->out1());
stk[nstk++] = AddState(ip->out());
break;
DCHECK(!ip->last());
a = AddState(id+1);
goto Loop;
case kInstNop:
if (!ip->last())
stk[nstk++] = AddState(id+1);
// Continue on.
stk[nstk++] = AddState(ip->out());
break;
a = AddState(ip->out());
goto Loop;
case kInstCapture:
if (!ip->last())
stk[nstk++] = AddState(id+1);
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore capture[j]
// Push a dummy whose only job is to restore t0
// once we finish exploring this possibility.
stk[nstk++] = AddState(0, capture[j], j);
stk[nstk++] = AddState(0, t0);
// Record capture.
capture[j] = p;
t = AllocThread();
CopyCapture(t->capture, t0->capture);
t->capture[j] = p;
t0 = t;
}
stk[nstk++] = AddState(ip->out());
break;
a = AddState(ip->out());
goto Loop;
case kInstByteRange:
if (!ip->Matches(c))
goto Next;
FALLTHROUGH_INTENDED;
case kInstMatch:
case kInstByteRange:
// Save state; will pick up at next byte.
t = AllocThread();
t->id = id;
CopyCapture(t->capture, capture);
t = Incref(t0);
*tp = t;
if (Debug)
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
if (ExtraDebug)
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
Next:
if (ip->last())
break;
a = AddState(id+1);
goto Loop;
case kInstEmptyWidth:
if (!ip->last())
stk[nstk++] = AddState(id+1);
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
stk[nstk++] = AddState(ip->out());
break;
a = AddState(ip->out());
goto Loop;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates match as new, better matches are found.
// p is position of the byte c in the input string,
// used when processing capturing parens.
// flag is the bitwise or of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input point (after c).
// Updates matched_ and match_ as new, better matches are found.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
@ -300,12 +345,12 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
FreeThread(t);
Decref(t);
continue;
}
}
int id = t->id;
int id = i->index();
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
@ -315,8 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
break;
case kInstByteRange:
if (ip->Matches(c))
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
AddToThreadq(nextq, ip->out(), c, flag, p, t);
break;
case kInstAltMatch:
@ -324,52 +368,58 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
break;
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture((const char**)match_, t->capture);
FreeThread(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
runq->clear();
CopyCapture(match_, t->capture);
matched_ = true;
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->second);
runq->clear();
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch:
if (endmatch_ && p != etext_)
case kInstMatch: {
// Avoid invoking undefined behavior when p happens
// to be null - and p-1 would be meaningless anyway.
if (p == NULL)
break;
if (endmatch_ && p-1 != etext_)
break;
const char* old = t->capture[1]; // previous end pointer
t->capture[1] = p;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
CopyCapture((const char**)match_, t->capture);
(t->capture[0] == match_[0] && p-1 > match_[1])) {
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
}
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture((const char**)match_, t->capture);
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
t->capture[0] = old;
FreeThread(t);
Decref(t);
for (++i; i != runq->end(); ++i)
FreeThread(i->second);
Decref(i->second);
runq->clear();
matched_ = true;
return 0;
}
t->capture[0] = old;
matched_ = true;
break;
}
FreeThread(t);
}
Decref(t);
}
runq->clear();
return 0;
@ -391,12 +441,6 @@ string NFA::FormatCapture(const char** capture) {
return s;
}
// Reports whether needle's [begin, end) range lies entirely within
// haystack's [begin, end) range. Only the pointers are compared.
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
  if (needle.begin() < haystack.begin())
    return false;
  return needle.end() <= haystack.end();
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
@ -407,12 +451,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
if (context.begin() == NULL)
context = text;
if (!StringPieceContains(context, text)) {
LOG(FATAL) << "Bad args: context does not contain text "
<< reinterpret_cast<const void*>(context.begin())
<< "+" << context.size() << " "
<< reinterpret_cast<const void*>(text.begin())
<< "+" << text.size();
// Sanity check: make sure that text lies within context.
if (text.begin() < context.begin() || text.end() > context.end()) {
LOG(DFATAL) << "context does not contain text";
return false;
}
@ -445,16 +486,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
match_ = new const char*[ncapture_];
matched_ = false;
memset(match_, 0, ncapture_*sizeof match_[0]);
// For debugging prints.
btext_ = context.begin();
if (Debug) {
if (ExtraDebug)
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.as_string().c_str(), context.as_string().c_str(), anchored,
text.ToString().c_str(), context.ToString().c_str(), anchored,
longest);
}
// Set up search.
Threadq* runq = &q0_;
@ -462,14 +501,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
const char* bp = context.begin();
int c = -1;
int wasword = 0;
if (text.begin() > context.begin()) {
c = text.begin()[-1] & 0xFF;
wasword = Prog::IsWordChar(c);
}
if (text.begin() > context.begin())
wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
@ -498,24 +533,29 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
else
flag |= kEmptyNonWordBoundary;
if (Debug) {
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
if (ExtraDebug) {
int c = 0;
if (p == context.begin())
c = '^';
else if (p > text.end())
c = '$';
else if (p < text.end())
c = p[0] & 0xFF;
fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", t->id,
FormatCapture((const char**)t->capture).c_str());
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
}
fprintf(stderr, "\n");
}
// Process previous character (waited until now to avoid
// repeating the flag computation above).
// This is a no-op the first time around the loop, because
// runq is empty.
int id = Step(runq, nextq, c, flag, p-1);
// This is a no-op the first time around the loop because runq is empty.
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
nextq->clear();
if (id != 0) {
@ -529,6 +569,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
break;
case kInstCapture:
if (ip->cap() < ncapture_)
match_[ip->cap()] = p;
id = ip->out();
continue;
@ -541,14 +582,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
match_[1] = p;
matched_ = true;
break;
case kInstEmptyWidth:
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
break;
}
id = ip->out();
continue;
}
break;
}
@ -566,10 +599,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
p < text.end() && (p[0] & 0xFF) != first_byte_) {
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
text.end() - p));
int fb = prog_->first_byte();
if (!anchored && runq->size() == 0 &&
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
@ -579,59 +612,48 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
flag = Prog::EmptyFlags(context, p);
}
// Steal match storage (cleared but unused as of yet)
// temporarily to hold match boundaries for new thread.
match_[0] = p;
AddToThreadq(runq, start_, flag, p, match_);
match_[0] = NULL;
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
Decref(t);
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (Debug)
if (ExtraDebug)
fprintf(stderr, "dead\n");
break;
}
if (p == text.end())
c = 0;
else
c = *p & 0xFF;
wasword = isword;
// Will run step(runq, nextq, c, ...) on next iteration. See above.
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
FreeThread(i->second);
Decref(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
if (Debug)
fprintf(stderr, "match (%d,%d)\n",
static_cast<int>(match_[0] - btext_),
static_cast<int>(match_[1] - btext_));
submatch[i] =
StringPiece(match_[2 * i],
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
if (ExtraDebug)
fprintf(stderr, "match (%td,%td)\n",
match_[0] - btext_, match_[1] - btext_);
return true;
}
VLOG(1) << "No matches found";
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int NFA::ComputeFirstByte() {
if (start_ == 0)
return -1;
int b = -1; // first byte, not yet computed
typedef SparseSet Workq;
Workq q(prog_->size());
q.insert(start_);
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
int Prog::ComputeFirstByte() {
int b = -1;
SparseSet q(size());
q.insert(start());
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = prog_->inst(id);
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
@ -642,6 +664,9 @@ int NFA::ComputeFirstByte() {
return -1;
case kInstByteRange:
if (!ip->last())
q.insert(id+1);
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
@ -658,6 +683,9 @@ int NFA::ComputeFirstByte() {
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
if (!ip->last())
q.insert(id+1);
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
@ -666,13 +694,9 @@ int NFA::ComputeFirstByte() {
q.insert(ip->out());
break;
case kInstAlt:
case kInstAltMatch:
// Explore alternatives.
if (ip->out())
q.insert(ip->out());
if (ip->out1())
q.insert(ip->out1());
DCHECK(!ip->last());
q.insert(id+1);
break;
case kInstFail:
@ -686,7 +710,7 @@ bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (NFA::Debug)
if (ExtraDebug)
Dump();
NFA nfa(this);
@ -705,5 +729,63 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
return true;
}
} // namespace re2
// Fills fanout with one entry per "root" instruction: start() and the out()
// target of every ByteRange instruction discovered along the way. The value
// stored for a root is the number of ByteRange instructions found by
// following only empty transitions from that root.
//
// fanout is both the result and the outer work queue; closure is the
// per-root work queue. Both are appended to while they are being walked.
void Prog::Fanout(SparseArray<int>* fanout) {
  DCHECK_EQ(fanout->max_size(), size());
  SparseSet closure(size());
  fanout->clear();
  fanout->set_new(start(), 0);
  for (SparseArray<int>::iterator root = fanout->begin();
       root != fanout->end(); ++root) {
    int* tally = &root->second;
    closure.clear();
    closure.insert(root->index());
    for (SparseSet::iterator pos = closure.begin(); pos != closure.end();
         ++pos) {
      int id = *pos;
      Prog::Inst* ip = inst(id);
      switch (ip->opcode()) {
        case kInstFail:
          break;

        case kInstMatch:
          if (!ip->last())
            closure.insert(id+1);
          break;

        case kInstNop:
        case kInstEmptyWidth:
        case kInstCapture:
          // Empty transitions: keep following both the next instruction
          // in the list and the out() arrow.
          if (!ip->last())
            closure.insert(id+1);
          closure.insert(ip->out());
          break;

        case kInstAltMatch:
          DCHECK(!ip->last());
          closure.insert(id+1);
          break;

        case kInstByteRange:
          if (!ip->last())
            closure.insert(id+1);
          // A byte-consuming instruction: count it for this root and make
          // sure its target becomes a root of its own.
          ++*tally;
          if (!fanout->has_index(ip->out()))
            fanout->set_new(ip->out(), 0);
          break;

        default:
          LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
          break;
      }
    }
  }
}
} // namespace re2

View File

@ -50,17 +50,29 @@
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/arena.h"
#include "util/logging.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for OneState::action.
#ifdef _MSC_VER
#pragma warning(disable: 4200)
#endif
namespace re2 {
static const int Debug = 0;
static const bool ExtraDebug = false;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
@ -126,19 +138,16 @@ static const int Debug = 0;
// whether a set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA (aka DFA) - just an array of actions.
struct OneState;
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
struct OneState {
// NOTE(review): this listing appears to show the pre-change and post-change
// member declarations back to back (uint32 / action[1] first, then
// uint32_t / action[]); a compilable struct needs exactly one pair --
// confirm which revision is intended.
uint32 matchcond; // conditions to match right now.
uint32 action[1];
uint32_t matchcond; // conditions to match right now.
// Trailing table with one entry per byte class. Its usable length is not
// given by this declaration: callers size each state as
// sizeof(OneState) plus room for the per-class entries (see the statesize
// computations in SearchOnePass and IsOnePass).
uint32_t action[];
};
// The uint32 conditions in the action are a combination of
// The uint32_t conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
@ -164,23 +173,23 @@ static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
static const int kCapShift = kRealCapShift - 2;
static const int kMaxCap = kRealMaxCap + 2;
static const uint32 kMatchWins = 1 << kEmptyShift;
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
static const uint32_t kMatchWins = 1 << kEmptyShift;
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Check, at compile time, that prog.h agrees with math above.
// This function is never called.
void OnePass_Checks() {
// NOTE(review): each condition below is stated twice, once with the
// COMPILE_ASSERT macro and once with the language-level static_assert;
// the two forms check the same expression.
//
// First condition: (1<<kEmptyShift)-1 is the mask of the low kEmptyShift
// bits, so this requires kEmptyAllFlags to be exactly that mask.
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
kEmptyShift_disagrees_with_kEmptyAllFlags);
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
"kEmptyShift disagrees with kEmptyAllFlags");
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
// Hence the factor of two in the comparison.
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
kMaxCap_disagrees_with_kMaxOnePassCapture);
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
"kMaxCap disagrees with kMaxOnePassCapture");
}
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
uint32 satisfied = Prog::EmptyFlags(context, p);
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
uint32_t satisfied = Prog::EmptyFlags(context, p);
if (cond & kEmptyAllFlags & ~satisfied)
return false;
return true;
@ -188,20 +197,17 @@ static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
// Apply the capture bits in cond, saving p to the appropriate
// locations in cap[].
// Parameters:
//   cond - action/match word whose capture bits select the slots to update.
//   p    - input position to record in every selected slot.
//   cap  - array of capture pointers, updated in place.
//   ncap - number of usable entries in cap.
// NOTE(review): two signature lines follow; they differ only in how the
// 32-bit type of cond is spelled (uint32 vs. uint32_t).
static void ApplyCaptures(uint32 cond, const char* p,
static void ApplyCaptures(uint32_t cond, const char* p,
const char** cap, int ncap) {
// Slot i is selected by bit (kCapShift + i) of cond. The loop starts at 2,
// so cap[0] and cap[1] are never written by this function.
for (int i = 2; i < ncap; i++)
if (cond & (1 << kCapShift << i))
cap[i] = p;
}
// Compute a node pointer.
// Basically (OneState*)(nodes + statesize*nodeindex)
// but the version with the C++ casts overflows 80 characters (and is ugly).
// Parameters:
//   nodes     - base address of a byte buffer holding consecutive states.
//   statesize - size in bytes of one state record.
//   nodeindex - zero-based index of the wanted state.
// Returns the address nodes + statesize*nodeindex viewed as a OneState*.
// No bounds checking is performed.
// NOTE(review): two signatures and two return statements follow (a
// volatile/const_cast variant and a plain uint8_t* variant); both compute
// the same address.
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
// Computes the OneState* for the given nodeindex.
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
int nodeindex) {
return reinterpret_cast<OneState*>(
const_cast<uint8*>(nodes + statesize*nodeindex));
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
}
bool Prog::SearchOnePass(const StringPiece& text,
@ -237,30 +243,27 @@ bool Prog::SearchOnePass(const StringPiece& text,
if (anchor_end())
kind = kFullMatch;
// State and act are marked volatile to
// keep the compiler from re-ordering the
// memory accesses walking over the NFA.
// This is worth about 5%.
volatile OneState* state = onepass_start_;
volatile uint8* nodes = onepass_nodes_;
volatile uint32 statesize = onepass_statesize_;
uint8* bytemap = bytemap_;
uint8_t* nodes = onepass_nodes_;
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
// start() is always mapped to the zeroth OneState.
OneState* state = IndexToNode(nodes, statesize, 0);
uint8_t* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
const char* p;
bool matched = false;
matchcap[0] = bp;
cap[0] = bp;
uint32 nextmatchcond = state->matchcond;
uint32_t nextmatchcond = state->matchcond;
for (p = bp; p < ep; p++) {
int c = bytemap[*p & 0xFF];
uint32 matchcond = nextmatchcond;
uint32 cond = state->action[c];
uint32_t matchcond = nextmatchcond;
uint32_t cond = state->action[c];
// Determine whether we can reach act->next.
// If so, advance state and nextmatchcond.
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32 nextindex = cond >> kIndexShift;
uint32_t nextindex = cond >> kIndexShift;
state = IndexToNode(nodes, statesize, nextindex);
nextmatchcond = state->matchcond;
} else {
@ -319,7 +322,7 @@ bool Prog::SearchOnePass(const StringPiece& text,
// Look for match at end of input.
{
uint32 matchcond = state->matchcond;
uint32_t matchcond = state->matchcond;
if (matchcond != kImpossible &&
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
if (nmatch > 1 && (matchcond & kCapMask))
@ -335,7 +338,9 @@ done:
if (!matched)
return false;
for (int i = 0; i < nmatch; i++)
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
match[i] =
StringPiece(matchcap[2 * i],
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
return true;
}
@ -357,7 +362,7 @@ static bool AddQ(Instq *q, int id) {
// Entry of the manual stack used by Prog::IsOnePass while flooding the
// instruction graph: an instruction paired with the condition bits collected
// on the way to it.
struct InstCond {
// Instruction id, as accepted by inst(id).
int id;
// Accumulated empty-width and capture bits for the path that reached id.
// NOTE(review): the member is listed twice (uint32 and uint32_t spellings);
// only one declaration can be kept in a compilable struct.
uint32 cond;
uint32_t cond;
};
// Returns whether this is a one-pass program; that is,
@ -377,7 +382,7 @@ struct InstCond {
// Constructs and saves corresponding one-pass NFA on success.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_start_ != NULL;
return onepass_nodes_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
@ -387,32 +392,37 @@ bool Prog::IsOnePass() {
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + byte_inst_count_;
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
int maxnodes = 2 + inst_count(kInstByteRange);
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int size = this->size();
InstCond *stack = new InstCond[size];
int stacksize = inst_count(kInstCapture) +
inst_count(kInstEmptyWidth) +
inst_count(kInstNop) + 1; // + 1 for start inst
InstCond* stack = new InstCond[stacksize];
int size = this->size();
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
uint8* nodes = new uint8[maxnodes*statesize];
uint8* nodep = nodes;
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
// unnecessarily optimistic: why allocate a large amount of memory
// upfront for a large program when it is unlikely to be one-pass?
std::vector<uint8_t> nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
nodep += statesize;
int nalloc = 1;
nodes.insert(nodes.end(), statesize, 0);
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes, statesize, nodeindex);
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
@ -427,93 +437,108 @@ bool Prog::IsOnePass() {
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
uint32_t cond = stack[nstack].cond;
Loop:
Prog::Inst* ip = inst(id);
uint32 cond = stack[nstack].cond;
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
// Fall through.
case kInstAlt:
DCHECK(!ip->last());
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
if (!AddQ(&workq, id+1))
goto fail;
stack[nstack].id = ip->out1();
stack[nstack++].cond = cond;
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
id = id+1;
goto Loop;
case kInstByteRange: {
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (Debug)
LOG(ERROR)
<< StringPrintf("Not OnePass: hit node limit %d > %d",
nalloc, maxnodes);
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
nodep += statesize;
nodebyid[ip->out()] = nextindex;
nalloc++;
AddQ(&tovisit, ip->out());
nodebyid[ip->out()] = nalloc;
nalloc++;
nodes.insert(nodes.end(), statesize, 0);
// Update node because it might have been invalidated.
node = IndexToNode(nodes.data(), statesize, nodeindex);
}
if (matched)
cond |= kMatchWins;
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in byte class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
}
if (ip->foldcase()) {
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
c = unbytemap_[b]; // last c in class
uint32 act = node->action[b];
uint32 newact = (nextindex << kIndexShift) | cond;
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (Debug) {
LOG(ERROR)
<< StringPrintf("Not OnePass: conflict on byte "
"%#x at state %d",
c, *it);
}
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
}
}
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
}
case kInstCapture:
if (ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
goto QueueEmpty;
case kInstEmptyWidth:
cond |= ip->empty();
goto QueueEmpty;
case kInstNop:
QueueEmpty:
if (!ip->last()) {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
stack[nstack].id = id+1;
stack[nstack++].cond = cond;
}
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
if (ip->opcode() == kInstEmptyWidth)
cond |= ip->empty();
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
@ -522,29 +547,32 @@ bool Prog::IsOnePass() {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
" %d -> %d\n",
*it, ip->out());
}
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
goto fail;
}
stack[nstack].id = ip->out();
stack[nstack++].cond = cond;
break;
id = ip->out();
goto Loop;
case kInstMatch:
if (matched) {
// (3) is violated
if (Debug) {
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
" from %d\n", *it);
}
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple matches from %d\n", *it);
goto fail;
}
matched = true;
node->matchcond = cond;
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstFail:
break;
@ -552,29 +580,22 @@ bool Prog::IsOnePass() {
}
}
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
string dump = "prog dump:\n" + Dump() + "node dump\n";
map<int, int> idmap;
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
LOG(ERROR) << "prog:\n" << Dump();
std::map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
StringAppendF(&dump, "byte ranges:\n");
int i = 0;
for (int b = 0; b < bytemap_range_; b++) {
int lo = i;
while (bytemap_[i] == b)
i++;
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
}
string dump;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes, statesize, nodeindex);
string s;
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
@ -586,19 +607,12 @@ bool Prog::IsOnePass() {
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << dump;
LOG(ERROR) << "nodes:\n" << dump;
}
// Overallocated earlier; cut down to actual size.
nodep = new uint8[nalloc*statesize];
memmove(nodep, nodes, nalloc*statesize);
delete[] nodes;
nodes = nodep;
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
onepass_nodes_ = nodes;
onepass_statesize_ = statesize;
dfa_mem_ -= nalloc*statesize;
onepass_nodes_ = new uint8_t[nalloc*statesize];
memmove(onepass_nodes_, nodes.data(), nalloc*statesize);
delete[] stack;
delete[] nodebyid;
@ -607,7 +621,6 @@ bool Prog::IsOnePass() {
fail:
delete[] stack;
delete[] nodebyid;
delete[] nodes;
return false;
}

File diff suppressed because it is too large Load Diff

View File

@ -2,34 +2,38 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "re2/prefilter.h"
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
static const int Trace = false;
static const bool ExtraDebug = false;
typedef set<string>::iterator SSIter;
typedef set<string>::const_iterator ConstSSIter;
typedef std::set<string>::iterator SSIter;
typedef std::set<string>::const_iterator ConstSSIter;
static int alloc_id = 100000; // Used for debugging.
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
op_ = op;
// Nodes other than AND/OR have no children, so subs_ stays NULL for them.
subs_ = NULL;
if (op_ == AND || op_ == OR)
subs_ = new vector<Prefilter*>;
// Debug-only bookkeeping: hand out a sequential id and log it.
alloc_id_ = alloc_id++;
VLOG(10) << "alloc_id: " << alloc_id_;
// NOTE(review): this second allocation line is the std::-qualified spelling
// of the one above; only one of the two should follow the if.
subs_ = new std::vector<Prefilter*>;
}
// Destroys a Prefilter.
Prefilter::~Prefilter() {
VLOG(10) << "Deleted: " << alloc_id_;
if (subs_) {
for (size_t i = 0; i < subs_->size(); i++)
delete (*subs_)[i];
@ -45,7 +49,7 @@ Prefilter* Prefilter::Simplify() {
}
// Nothing left in the AND/OR.
if (subs_->size() == 0) {
if (subs_->empty()) {
if (op_ == AND)
op_ = ALL; // AND of nothing is true
else
@ -136,7 +140,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
static void SimplifyStringSet(set<string> *ss) {
static void SimplifyStringSet(std::set<string> *ss) {
// Now make sure that the strings aren't redundant. For example, if
// we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so delete "abc". This
@ -157,7 +161,7 @@ static void SimplifyStringSet(set<string> *ss) {
}
}
Prefilter* Prefilter::OrStrings(set<string>* ss) {
Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
SimplifyStringSet(ss);
Prefilter* or_prefilter = NULL;
if (!ss->empty()) {
@ -176,7 +180,7 @@ static Rune ToLowerRune(Rune r) {
}
const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
if (f == NULL || r < static_cast<Rune>(f->lo))
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
}
@ -222,14 +226,14 @@ class Prefilter::Info {
// Caller takes ownership of the Prefilter.
Prefilter* TakeMatch();
set<string>& exact() { return exact_; }
std::set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
set<string> exact_;
std::set<string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
@ -268,7 +272,9 @@ string Prefilter::Info::ToString() {
if (is_exact_) {
int n = 0;
string s;
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
for (std::set<string>::iterator i = exact_.begin();
i != exact_.end();
++i) {
if (n++ > 0)
s += ",";
s += *i;
@ -283,16 +289,17 @@ string Prefilter::Info::ToString() {
}
// Add the strings from src to dst.
// Parameters:
//   src - strings to copy; left unmodified.
//   dst - destination set; existing elements are kept, and strings already
//         present are not duplicated (set semantics).
// NOTE(review): two signatures follow, differing only in std:: qualification.
static void CopyIn(const set<string>& src, set<string>* dst) {
static void CopyIn(const std::set<string>& src,
std::set<string>* dst) {
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
dst->insert(*i);
}
// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const set<string>& a,
const set<string>& b,
set<string>* dst) {
static void CrossProduct(const std::set<string>& a,
const std::set<string>& b,
std::set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
@ -446,10 +453,10 @@ Prefilter::Info* Prefilter::Info::EmptyString() {
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
bool latin1) {
if (Trace) {
VLOG(0) << "CharClassInfo:";
if (ExtraDebug) {
LOG(ERROR) << "CharClassInfo:";
for (CCIter i = cc->begin(); i != cc->end(); ++i)
VLOG(0) << " " << i->lo << "-" << i->hi;
LOG(ERROR) << " " << i->lo << "-" << i->hi;
}
// If the class is too large, it's okay to overestimate.
@ -469,9 +476,8 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
a->is_exact_ = true;
if (Trace) {
VLOG(0) << " = " << a->ToString();
}
if (ExtraDebug)
LOG(ERROR) << " = " << a->ToString();
return a;
}
@ -492,15 +498,16 @@ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
bool latin1() { return latin1_; }
private:
bool latin1_;
DISALLOW_EVIL_CONSTRUCTORS(Walker);
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
};
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
if (Trace) {
LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
}
if (ExtraDebug)
LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
bool latin1 = re->parse_flags() & Regexp::Latin1;
bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
Prefilter::Info::Walker w(latin1);
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
@ -600,7 +607,6 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
VLOG(10) << "Alt: " << info->ToString();
break;
case kRegexpStar:
@ -630,10 +636,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
break;
}
if (Trace) {
VLOG(0) << "BuildInfo " << re->ToString()
if (ExtraDebug)
LOG(ERROR) << "BuildInfo " << re->ToString()
<< ": " << (info ? info->ToString() : "");
}
return info;
}

View File

@ -2,14 +2,19 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
#include <set>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
namespace re2 {
@ -37,14 +42,14 @@ class Prefilter {
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
// Returns the children vector, still owned by this node. It is meaningful
// only for AND and OR nodes, which the assertion below enforces.
// NOTE(review): two variants are listed -- one asserting with CHECK and one
// with DCHECK; they differ in the assertion macro and std:: qualification.
vector<Prefilter*>* subs() {
CHECK(op_ == AND || op_ == OR);
std::vector<Prefilter*>* subs() {
DCHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
@ -72,7 +77,7 @@ class Prefilter {
static Prefilter* FromString(const string& str);
static Prefilter* OrStrings(set<string>* ss);
static Prefilter* OrStrings(std::set<string>* ss);
static Info* BuildInfo(Regexp* re);
@ -82,7 +87,7 @@ class Prefilter {
Op op_;
// Sub-matches for AND or OR Prefilter.
vector<Prefilter*>* subs_;
std::vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
string atom_;
@ -94,10 +99,8 @@ class Prefilter {
// and -1 for duplicate nodes.
int unique_id_;
// Used for debugging, helps in tracking memory leaks.
int alloc_id_;
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
Prefilter(const Prefilter&) = delete;
Prefilter& operator=(const Prefilter&) = delete;
};
} // namespace re2

View File

@ -2,20 +2,35 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/flags.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
#include "re2/re2.h"
DEFINE_int32(filtered_re2_min_atom_len,
3,
"Strings less than this length are not stored as atoms");
#include <stddef.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
namespace re2 {
static const bool ExtraDebug = false;
// Default constructor: starts uncompiled and uses a minimum atom length of 3.
// NOTE(review): two initializer lists are shown for this constructor, one
// without and one with min_atom_len_; keep only one.
PrefilterTree::PrefilterTree()
: compiled_(false) {
: compiled_(false),
min_atom_len_(3) {
}
// Same as above, but the caller chooses the minimum atom length; KeepNode
// compares atom sizes against this value.
PrefilterTree::PrefilterTree(int min_atom_len)
: compiled_(false),
min_atom_len_(min_atom_len) {
}
PrefilterTree::~PrefilterTree() {
@ -26,62 +41,22 @@ PrefilterTree::~PrefilterTree() {
delete entries_[i].parents;
}
// Functions used for adding and Compiling prefilters to the
// PrefilterTree.
// Reports whether `prefilter` is still useful as a guard, pruning AND nodes
// in place along the way:
//   - NULL and ALL nodes are never kept.
//   - An ATOM is kept when it is at least FLAGS_filtered_re2_min_atom_len
//     characters long.
//   - An AND node drops (and deletes) every child that is not kept, and is
//     itself kept if at least one child survives.
//   - An OR node is kept only if every child is kept; the scan stops at the
//     first rejected child.
// `level` is the recursion depth; it is only forwarded to recursive calls.
static bool KeepPart(Prefilter* prefilter, int level) {
  if (prefilter == NULL)
    return false;

  switch (prefilter->op()) {
    case Prefilter::ALL:
      return false;

    case Prefilter::ATOM: {
      const size_t min_len =
          static_cast<size_t>(FLAGS_filtered_re2_min_atom_len);
      return prefilter->atom().size() >= min_len;
    }

    case Prefilter::AND: {
      vector<Prefilter*>* children = prefilter->subs();
      size_t kept = 0;
      // Compact the surviving children towards the front of the vector.
      for (size_t i = 0; i < children->size(); i++) {
        Prefilter* child = (*children)[i];
        if (KeepPart(child, level + 1))
          (*children)[kept++] = child;
        else
          delete child;
      }
      children->resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      for (size_t i = 0; i < prefilter->subs()->size(); i++) {
        if (!KeepPart((*prefilter->subs())[i], level + 1))
          return false;
      }
      return true;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepPart: "
                  << prefilter->op();
      return false;
  }
}
// Registers the prefilter of the next regexp. The position of the entry in
// prefilter_vec_ is the regexp's index, so an entry is pushed even when the
// prefilter is NULL or gets rejected.
// A prefilter that the keep-check rejects is deleted here and replaced by
// NULL, so the tree takes ownership of the pointer passed in.
// Calling this after Compile() logs a DFATAL message and adds nothing.
// NOTE(review): old and new lines are interleaved below (parameter named f
// checked with KeepPart vs. parameter named prefilter checked with
// KeepNode); both variants perform the same steps.
void PrefilterTree::Add(Prefilter *f) {
void PrefilterTree::Add(Prefilter* prefilter) {
if (compiled_) {
LOG(DFATAL) << "Add after Compile.";
LOG(DFATAL) << "Add called after Compile.";
return;
}
if (f != NULL && !KeepPart(f, 0)) {
delete f;
f = NULL;
if (prefilter != NULL && !KeepNode(prefilter)) {
delete prefilter;
prefilter = NULL;
}
prefilter_vec_.push_back(f);
prefilter_vec_.push_back(prefilter);
}
void PrefilterTree::Compile(vector<string>* atom_vec) {
void PrefilterTree::Compile(std::vector<string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile after Compile.";
LOG(DFATAL) << "Compile called already.";
return;
}
@ -93,7 +68,9 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
compiled_ = true;
AssignUniqueIds(atom_vec);
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
NodeMap nodes;
AssignUniqueIds(&nodes, atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
@ -109,9 +86,11 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it) {
have_other_guard = have_other_guard &&
(entries_[it->first].propagate_up_at_count > 1);
}
if (have_other_guard) {
for (StdIntMap::iterator it = parents->begin();
@ -123,50 +102,82 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
}
}
PrintDebugInfo();
if (ExtraDebug)
PrintDebugInfo(&nodes);
}
// Looks up the node previously registered under the same NodeString() key as
// `node`. Returns that registered node (which may be `node` itself), or NULL
// if no node with this key has been registered yet.
// NOTE(review): two variants are interleaved below -- one searching the
// member node_map_, the other searching the map passed in as `nodes`.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
string node_string = NodeString(node);
map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
if (iter == node_map_.end())
std::map<string, Prefilter*>::iterator iter = nodes->find(node_string);
if (iter == nodes->end())
return NULL;
return (*iter).second;
}
// Returns the decimal text of n (formatted with "%d").
static string Itoa(int n) {
  char digits[100];
  snprintf(digits, sizeof digits, "%d", n);
  string text(digits);
  return text;
}
// Builds the lookup key for `node`: the numeric op, a ':', and then either
// the atom text (ATOM nodes) or the comma-separated unique ids of the
// children (all other nodes).
// The children's unique_id() values are read as-is, so they must already be
// assigned when this is called.
// NOTE(review): every non-ATOM node takes the subs() branch; subs() asserts
// AND/OR, so this presumably relies on ALL nodes never reaching here --
// confirm against the callers.
// NOTE(review): Itoa(...) and StringPrintf("%d", ...) lines are interleaved
// below; each pair formats the same value.
string PrefilterTree::NodeString(Prefilter* node) const {
// Adding the operation disambiguates AND/OR/atom nodes.
string s = Itoa(node->op()) + ":";
string s = StringPrintf("%d", node->op()) + ":";
if (node->op() == Prefilter::ATOM) {
s += node->atom();
} else {
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
s += ',';
s += Itoa((*node->subs())[i]->unique_id());
s += StringPrintf("%d", (*node->subs())[i]->unique_id());
}
}
return s;
}
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
// Reports whether `node` is still useful as a guard, pruning AND nodes in
// place along the way:
//   - NULL and ALL nodes are never kept.
//   - An ATOM is kept when it is at least min_atom_len_ characters long.
//   - An AND node drops (and deletes) every child that is not kept, and is
//     itself kept if at least one child survives.
//   - An OR node is kept only if every child is kept; the scan stops at the
//     first rejected child.
bool PrefilterTree::KeepNode(Prefilter* node) const {
  if (node == NULL)
    return false;

  switch (node->op()) {
    case Prefilter::ALL:
      return false;

    case Prefilter::ATOM: {
      const size_t min_len = static_cast<size_t>(min_atom_len_);
      return node->atom().size() >= min_len;
    }

    case Prefilter::AND: {
      std::vector<Prefilter*>* children = node->subs();
      size_t kept = 0;
      // Compact the surviving children towards the front of the vector.
      for (size_t i = 0; i < children->size(); i++) {
        Prefilter* child = (*children)[i];
        if (KeepNode(child))
          (*children)[kept++] = child;
        else
          delete child;
      }
      children->resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      for (size_t i = 0; i < node->subs()->size(); i++) {
        Prefilter* child = (*node->subs())[i];
        if (!KeepNode(child))
          return false;
      }
      return true;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
      return false;
  }
}
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
std::vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
vector<Prefilter*> v;
std::vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(i);
unfiltered_.push_back(static_cast<int>(i));
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
@ -179,7 +190,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const vector<Prefilter*>& subs = *f->subs();
const std::vector<Prefilter*>& subs = *f->subs();
for (size_t j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
@ -187,16 +198,16 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
// Identify unique nodes.
int unique_id = 0;
for (int i = v.size() - 1; i >= 0; i--) {
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(node);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
node_map_[NodeString(node)] = node;
nodes->emplace(NodeString(node), node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
@ -206,15 +217,15 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(node_map_.size());
entries_.resize(nodes->size());
// Create parent IntMap for the entries.
for (int i = v.size() - 1; i >= 0; i--) {
// Create parent StdIntMap for the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
@ -222,12 +233,12 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
}
// Fill the entries.
for (int i = v.size() - 1; i >= 0; i--) {
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(prefilter) != prefilter)
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
@ -244,10 +255,10 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
case Prefilter::OR:
case Prefilter::AND: {
set<int> uniq_child;
std::set<int> uniq_child;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(child);
Prefilter* canonical = CanonicalNode(nodes, child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
@ -256,11 +267,14 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
uniq_child.insert(child_id);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end())
if (child_entry->parents->find(prefilter->unique_id()) ==
child_entry->parents->end()) {
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
entry->propagate_up_at_count =
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
}
entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
? static_cast<int>(uniq_child.size())
: 1;
break;
}
@ -271,29 +285,28 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(i);
entry->regexps.push_back(static_cast<int>(i));
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const vector<int>& matched_atoms,
vector<int>* regexps) const {
const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(WARNING) << "Compile() not called";
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(i);
regexps->push_back(static_cast<int>(i));
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(prefilter_vec_.size());
vector<int> matched_atom_ids;
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
std::vector<int> matched_atom_ids;
for (size_t j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
@ -304,23 +317,20 @@ void PrefilterTree::RegexpsGivenStrings(
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
sort(regexps->begin(), regexps->end());
std::sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(entries_.size());
IntMap work(entries_.size());
IntMap count(static_cast<int>(entries_.size()));
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
VLOG(10) << "Processing: " << it->index();
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++) {
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
for (size_t i = 0; i < entry.regexps.size(); i++)
regexps->set(entry.regexps[i], 1);
}
int c;
// Pass trigger up to parents.
for (StdIntMap::iterator it = entry.parents->begin();
@ -328,7 +338,6 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
++it) {
int j = it->first;
const Entry& parent = entries_[j];
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
@ -341,7 +350,6 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
if (c < parent.propagate_up_at_count)
continue;
}
VLOG(10) << "Triggering: " << j;
// Trigger the parent.
work.set(j, 1);
}
@ -350,25 +358,25 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
// Debugging help.
// Logs the DebugNodeString() of the prefilter stored for regexp `regexpid`.
// `regexpid` is used directly as an index into prefilter_vec_; no bounds
// check is performed here.
// NOTE(review): the same message is logged at both INFO and ERROR severity
// below; these look like two alternative versions of one line - confirm
// which one is intended.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}
// Debugging aid: logs the number of unique atoms and entries, then for each
// entry its id, its parent count ("N:"), its regexp count ("R:") and the id
// of every parent, and finally each node-string -> unique_id pair of the
// node map.
// NOTE(review): this span contains two signatures and pairs of near-identical
// statements (VLOG(10)/node_map_ versus LOG(ERROR)/nodes); they look like an
// older and a newer version of the same function interleaved - confirm which
// lines are live before relying on it.
void PrefilterTree::PrintDebugInfo() {
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
VLOG(10) << "#Unique Nodes: " << entries_.size();
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
// One record per entry, followed by the ids of its parents.
for (size_t i = 0; i < entries_.size(); ++i) {
StdIntMap* parents = entries_[i].parents;
const vector<int>& regexps = entries_[i].regexps;
VLOG(10) << "EntryId: " << i
const std::vector<int>& regexps = entries_[i].regexps;
LOG(ERROR) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
VLOG(10) << it->first;
LOG(ERROR) << it->first;
}
// Dump of the node-string -> canonical node mapping.
VLOG(10) << "Map:";
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
iter != node_map_.end(); ++iter)
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
LOG(ERROR) << "Map:";
for (std::map<string, Prefilter*>::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
@ -385,7 +393,7 @@ string PrefilterTree::DebugNodeString(Prefilter* node) const {
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
node_string += ',';
node_string += Itoa((*node->subs())[i]->unique_id());
node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
node_string += ":";
node_string += DebugNodeString((*node->subs())[i]);
}

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added to PrefilterTree, and then PrefilterTree is used to find all
@ -12,23 +15,21 @@
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
//
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/sparse_array.h"
#include "re2/prefilter.h"
namespace re2 {
typedef SparseArray<int> IntMap;
typedef map<int,int> StdIntMap;
class Prefilter;
class PrefilterTree {
public:
PrefilterTree();
explicit PrefilterTree(int min_atom_len);
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
@ -42,20 +43,24 @@ class PrefilterTree {
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// collected and passed to RegexpsGivenStrings below.
void Compile(vector<string>* atom_vec);
void Compile(std::vector<string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const vector<int>& matched_atoms,
vector<int>* regexps) const;
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
private:
typedef SparseArray<int> IntMap;
typedef std::map<int, int> StdIntMap;
typedef std::map<string, Prefilter*> NodeMap;
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
@ -76,22 +81,24 @@ class PrefilterTree {
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
vector<int> regexps;
std::vector<int> regexps;
};
private:
// Returns true if the prefilter node should be kept.
bool KeepNode(Prefilter* node) const;
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(vector<string>* atom_vec);
void AssignUniqueIds(NodeMap* nodes, std::vector<string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const vector<int>& atom_ids,
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(Prefilter* node);
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node have already been assigned unique ids.
@ -101,29 +108,30 @@ class PrefilterTree {
string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo();
void PrintDebugInfo(NodeMap* nodes);
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
vector<Entry> entries_;
// Map node string to canonical Prefilter node.
map<string, Prefilter*> node_map_;
std::vector<Entry> entries_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
vector<int> unfiltered_;
std::vector<int> unfiltered_;
// vector of Prefilter for all regexps.
vector<Prefilter*> prefilter_vec_;
std::vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
vector<int> atom_index_to_id_;
std::vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
// Strings less than this length are not stored as atoms.
const int min_atom_len_;
PrefilterTree(const PrefilterTree&) = delete;
PrefilterTree& operator=(const PrefilterTree&) = delete;
};
} // namespace

View File

@ -5,48 +5,57 @@
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "util/util.h"
#include "util/sparse_set.h"
#include "re2/prog.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/bitmap256.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
// Initializes this instruction as a kInstAlt with primary successor `out`
// and alternate successor `out1`. The DCHECK expects out_opcode_ to still be
// zero, i.e. the instruction must not have been initialized already.
// NOTE(review): two signatures (uint32 / uint32_t) appear back to back; they
// look like alternative versions of the same line - confirm which is live.
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstAlt);
out1_ = out1;
}
// Initializes this instruction as a kInstByteRange with successor `out`.
// `lo` and `hi` are truncated to their low 8 bits before being stored.
// The DCHECK expects out_opcode_ to still be zero on entry.
// NOTE(review): two signatures and two foldcase_ assignments (with and
// without the 0xFF mask) appear here; they look like alternative versions of
// the same lines - confirm which is live.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstByteRange);
lo_ = lo & 0xFF;
hi_ = hi & 0xFF;
foldcase_ = foldcase;
foldcase_ = foldcase & 0xFF;
}
// Initializes this instruction as a kInstCapture that records into capture
// register `cap` and continues at `out`. The DCHECK expects out_opcode_ to
// still be zero on entry.
// NOTE(review): two signatures (uint32 / uint32_t) appear back to back; they
// look like alternative versions of the same line - confirm which is live.
void Prog::Inst::InitCapture(int cap, uint32 out) {
void Prog::Inst::InitCapture(int cap, uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstCapture);
cap_ = cap;
}
// Initializes this instruction as a kInstEmptyWidth carrying the empty-width
// flags `empty` and continuing at `out`. The DCHECK expects out_opcode_ to
// still be zero on entry.
// NOTE(review): two signatures (uint32 / uint32_t) appear back to back; they
// look like alternative versions of the same line - confirm which is live.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstEmptyWidth);
empty_ = empty;
}
// Initializes this instruction as a kInstMatch tagged with match id `id`.
// Only the opcode is set; no successor is stored. The DCHECK expects
// out_opcode_ to still be zero on entry.
// NOTE(review): two signatures (int32 / int32_t) appear back to back; they
// look like alternative versions of the same line - confirm which is live.
void Prog::Inst::InitMatch(int32 id) {
void Prog::Inst::InitMatch(int32_t id) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstMatch);
match_id_ = id;
}
void Prog::Inst::InitNop(uint32 out) {
void Prog::Inst::InitNop(uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstNop);
}
@ -94,34 +103,27 @@ Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_flatten_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
byte_inst_count_(0),
bytemap_range_(0),
first_byte_(-1),
flags_(0),
onepass_statesize_(0),
list_count_(0),
inst_(NULL),
dfa_first_(NULL),
dfa_longest_(NULL),
dfa_mem_(0),
delete_dfa_(NULL),
unbytemap_(NULL),
onepass_nodes_(NULL),
onepass_start_(NULL) {
dfa_mem_(0),
dfa_first_(NULL),
dfa_longest_(NULL) {
}
// Destructor: disposes of the two DFAs and frees the onepass_nodes_, inst_
// and unbytemap_ arrays with delete[].
// NOTE(review): the DFAs are released here both through the delete_dfa_
// callback and through direct DeleteDFA() calls; these look like alternative
// versions of the same cleanup - confirm which one is live.
Prog::~Prog() {
if (delete_dfa_) {
if (dfa_first_)
delete_dfa_(dfa_first_);
if (dfa_longest_)
delete_dfa_(dfa_longest_);
}
DeleteDFA(dfa_longest_);
DeleteDFA(dfa_first_);
delete[] onepass_nodes_;
delete[] inst_;
delete[] unbytemap_;
}
typedef SparseSet Workq;
@ -133,7 +135,6 @@ static inline void AddToQueue(Workq* q, int id) {
static string ProgToString(Prog* prog, Workq* q) {
string s;
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
@ -145,29 +146,56 @@ static string ProgToString(Prog* prog, Workq* q) {
return s;
}
// Renders the instructions of `prog` with ids in [start, prog->size()) as
// text, one per line. The id is followed by "." when the instruction's
// last() flag is set and by "+" otherwise, then by the instruction's Dump().
static string FlattenedProgToString(Prog* prog, int start) {
  string text;
  for (int id = start; id < prog->size(); ++id) {
    Prog::Inst* cur = prog->inst(id);
    if (!cur->last()) {
      StringAppendF(&text, "%d+ %s\n", id, cur->Dump().c_str());
      continue;
    }
    StringAppendF(&text, "%d. %s\n", id, cur->Dump().c_str());
  }
  return text;
}
// Returns a textual dump of the program starting from start_.
// A flattened program (did_flatten_) is dumped with FlattenedProgToString();
// otherwise the instructions reachable from start_ are collected through a
// work queue and dumped with ProgToString().
// NOTE(review): the disabled `if (false)` byte-map section and the two
// consecutive return statements look like an older and a newer version of
// this function interleaved - confirm which lines are live.
string Prog::Dump() {
string map;
if (false) { // Debugging
int lo = 0;
StringAppendF(&map, "byte map:\n");
for (int i = 0; i < bytemap_range_; i++) {
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
lo = unbytemap_[i] + 1;
}
StringAppendF(&map, "\n");
}
if (did_flatten_)
return FlattenedProgToString(this, start_);
// The queue is sized to hold every instruction id of the program.
Workq q(size_);
AddToQueue(&q, start_);
return map + ProgToString(this, &q);
return ProgToString(this, &q);
}
// Returns a textual dump of the program starting from start_unanchored_.
// Unflattened programs are walked through a work queue; flattened ones are
// printed directly by FlattenedProgToString().
string Prog::DumpUnanchored() {
  if (!did_flatten_) {
    Workq pending(size_);
    AddToQueue(&pending, start_unanchored_);
    return ProgToString(this, &pending);
  }
  return FlattenedProgToString(this, start_unanchored_);
}
// Returns a textual description of bytemap_: one "[lo-hi] -> class" line
// (bounds in two-digit hex) for every maximal run of consecutive byte values
// that map to the same byte class.
string Prog::DumpByteMap() {
  string out;
  int lo = 0;
  while (lo < 256) {
    const int cls = bytemap_[lo];
    // Extend the run while the following byte has the same class.
    int hi = lo;
    while (hi < 256-1 && bytemap_[hi+1] == cls)
      ++hi;
    StringAppendF(&out, "[%02x-%02x] -> %d\n", lo, hi, cls);
    lo = hi + 1;
  }
  return out;
}
int Prog::first_byte() {
std::call_once(first_byte_once_, [](Prog* prog) {
prog->first_byte_ = prog->ComputeFirstByte();
}, this);
return first_byte_;
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
@ -260,7 +288,7 @@ static bool IsMatch(Prog* prog, Prog::Inst* ip) {
}
}
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
// ^ and \A
@ -294,50 +322,505 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
return flags;
}
void Prog::MarkByteRange(int lo, int hi) {
// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
// performance; rather, it means that the ranges are treated indistinguishably.
//
// Internally, the ranges are represented using a bitmap that stores the splits
// and a vector that stores the colors; both of them are indexed by the ranges'
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
// hi (if not already split), then recolor each range in between. The color map
// (i.e. from the old color to the new color) is maintained for the lifetime of
// the batch and so underpins this somewhat obscure approach to set operations.
//
// The second phase builds the bytemap from our internal state: we recolor each
// range, then store the new color (which is now the byte class) in each of the
// corresponding array elements. Finally, we output the number of byte classes.
// Builds the byte -> byte-class map in two phases: batches of Mark() calls
// folded in by Merge(), then a single Build() that writes the final map.
// Not copyable.
class ByteMapBuilder {
 public:
  // Starts with a single [0-255] range whose color is 256, so that the
  // colors handed out before Build() never collide with the byte classes
  // (numbered from 0) that Build() assigns.
  ByteMapBuilder() : colors_(256), nextcolor_(257) {
    splits_.Set(255);
    colors_[255] = 256;
  }

  // Queues the range [lo-hi] for the current batch.
  void Mark(int lo, int hi);
  // Folds every queued range into the internal state and ends the batch.
  void Merge();
  // Writes the byte classes into bytemap and their count into bytemap_range.
  void Build(uint8_t* bytemap, int* bytemap_range);

 private:
  // Maps oldcolor to its replacement color for the current batch.
  int Recolor(int oldcolor);

  Bitmap256 splits_;                          // split points, by last byte
  std::vector<int> colors_;                   // colors, by last byte
  int nextcolor_;                             // next unused color
  std::vector<std::pair<int, int>> colormap_; // (old, new) colors this batch
  std::vector<std::pair<int, int>> ranges_;   // (lo, hi) ranges this batch

  ByteMapBuilder(const ByteMapBuilder&) = delete;
  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
};
// Queues the byte range [lo-hi] (inclusive) for the current batch by
// appending it to ranges_. The DCHECKs require 0 <= lo <= hi <= 255.
// A full [0-255] range is dropped without being queued.
void ByteMapBuilder::Mark(int lo, int hi) {
DCHECK_GE(lo, 0);
DCHECK_GE(hi, 0);
DCHECK_LE(lo, 255);
DCHECK_LE(hi, 255);
DCHECK_LE(lo, hi);
// NOTE(review): the next four lines use byterange_, which is not among the
// members declared in class ByteMapBuilder; they look like leftovers of the
// older Prog::MarkByteRange body - confirm whether they are live.
if (0 < lo && lo <= 255)
byterange_.Set(lo - 1);
if (0 <= hi && hi <= 255)
byterange_.Set(hi);
// Ignore any [0-255] ranges. They cause us to recolor every range, which
// has no effect on the eventual result and is therefore a waste of time.
if (lo == 0 && hi == 255)
return;
ranges_.emplace_back(lo, hi);
}
void ByteMapBuilder::Merge() {
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
it != ranges_.end();
++it) {
int lo = it->first-1;
int hi = it->second;
if (0 <= lo && !splits_.Test(lo)) {
splits_.Set(lo);
int next = splits_.FindNextSetBit(lo+1);
colors_[lo] = colors_[next];
}
if (!splits_.Test(hi)) {
splits_.Set(hi);
int next = splits_.FindNextSetBit(hi+1);
colors_[hi] = colors_[next];
}
int c = lo+1;
while (c < 256) {
int next = splits_.FindNextSetBit(c);
colors_[next] = Recolor(colors_[next]);
if (next == hi)
break;
c = next+1;
}
}
colormap_.clear();
ranges_.clear();
}
// Writes the final byte classes into bytemap (indexed by byte value, 256
// entries are written) and stores the number of classes in *bytemap_range.
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
  // Restart color numbering so that byte classes count up from 0.
  nextcolor_ = 0;

  for (int c = 0; c < 256;) {
    // `end` is the last byte of the range that contains c.
    const int end = splits_.FindNextSetBit(c);
    const uint8_t cls = static_cast<uint8_t>(Recolor(colors_[end]));
    for (; c <= end; ++c)
      bytemap[c] = cls;
  }

  *bytemap_range = nextcolor_;
}
// Returns the replacement color for oldcolor within the current batch.
// If oldcolor already appears in colormap_ - as an old color or as a new
// one - the new color of that entry is returned; checking both sides keeps a
// range from being recolored twice in one batch. Otherwise a fresh color is
// taken from nextcolor_, recorded in colormap_ and returned.
// The scan is linear on purpose: there can be at most 256 colors.
int ByteMapBuilder::Recolor(int oldcolor) {
  for (const std::pair<int, int>& kv : colormap_) {
    if (kv.first == oldcolor || kv.second == oldcolor)
      return kv.second;
  }
  const int newcolor = nextcolor_++;
  colormap_.emplace_back(oldcolor, newcolor);
  return newcolor;
}
// Fills in bytemap_ and bytemap_range_ so that bytes the program treats
// indistinguishably share a byte class. Every kInstByteRange contributes its
// [lo-hi] range (plus the upper-case counterpart when it folds case), and
// kInstEmptyWidth instructions contribute '\n' and the word/non-word
// character ranges at most once each.
// NOTE(review): this span mixes statements built on byterange()/unbytemap_/
// COMPILE_ASSERT with the ByteMapBuilder-based implementation, and the
// braces do not balance as written; it looks like an older and a newer
// version of the function interleaved - confirm which lines are live.
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for prog_.
// Ranges of bytes that are treated as indistinguishable
// by the regexp program are mapped to a single byte class.
// The vector prog_->byterange() marks the end of each
// such range.
const Bitmap<256>& v = byterange();
// Fill in bytemap with byte classes for the program.
// Ranges of bytes that are treated indistinguishably
// will be mapped to a single byte class.
ByteMapBuilder builder;
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
uint8 n = 0;
uint32 bits = 0;
for (int i = 0; i < 256; i++) {
if ((i&31) == 0)
bits = v.Word(i >> 5);
bytemap_[i] = n;
n += bits & 1;
bits >>= 1;
// Don't repeat the work for ^ and $.
bool marked_line_boundaries = false;
// Don't repeat the work for \b and \B.
bool marked_word_boundaries = false;
for (int id = 0; id < size(); id++) {
Inst* ip = inst(id);
if (ip->opcode() == kInstByteRange) {
int lo = ip->lo();
int hi = ip->hi();
builder.Mark(lo, hi);
// A case-folding range that overlaps [a-z] also marks the matching part
// of [A-Z]: the overlap is clamped to [a-z] and shifted by 'A' - 'a'.
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
int foldlo = lo;
int foldhi = hi;
if (foldlo < 'a')
foldlo = 'a';
if (foldhi > 'z')
foldhi = 'z';
if (foldlo <= foldhi)
builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
}
bytemap_range_ = bytemap_[255] + 1;
unbytemap_ = new uint8[bytemap_range_];
// If this Inst is not the last Inst in its list AND the next Inst is
// also a ByteRange AND the Insts have the same out, defer the merge.
if (!ip->last() &&
inst(id+1)->opcode() == kInstByteRange &&
ip->out() == inst(id+1)->out())
continue;
builder.Merge();
} else if (ip->opcode() == kInstEmptyWidth) {
if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
!marked_line_boundaries) {
builder.Mark('\n', '\n');
builder.Merge();
marked_line_boundaries = true;
}
if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
!marked_word_boundaries) {
// We require two batches here: the first for ranges that are word
// characters, the second for ranges that are not word characters.
for (bool isword : {true, false}) {
int j;
// [i, j-1] is a maximal run of bytes with the same IsWordChar() result.
for (int i = 0; i < 256; i = j) {
for (j = i + 1; j < 256 &&
Prog::IsWordChar(static_cast<uint8_t>(i)) ==
Prog::IsWordChar(static_cast<uint8_t>(j));
j++)
;
if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
builder.Mark(i, j - 1);
}
builder.Merge();
}
marked_word_boundaries = true;
}
}
}
builder.Build(bytemap_, &bytemap_range_);
if (0) { // For debugging, use trivial bytemap.
LOG(ERROR) << "Using trivial bytemap.";
for (int i = 0; i < 256; i++)
unbytemap_[bytemap_[i]] = i;
if (0) { // For debugging: use trivial byte map.
for (int i = 0; i < 256; i++) {
bytemap_[i] = i;
unbytemap_[i] = i;
}
bytemap_[i] = static_cast<uint8_t>(i);
bytemap_range_ = 256;
LOG(INFO) << "Using trivial bytemap.";
}
}
// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
// transitions: those from Capture and EmptyWidth instructions; and those from
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
// in the worst case.) It might be best thought of as Alt instruction elision.
//
// In conceptual terms, it divides the Prog into "trees" of instructions, then
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
// is one or more instructions that grow from one "root" instruction to one or
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
// "root" is also the "leaf". In most cases, a "root" is the successor of some
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
// EmptyWidth or Match instruction. However, this is insufficient for handling
// nested nullable subexpressions correctly, so in some cases, a "root" is the
// dominator of the instructions reachable from some "successor root" (i.e. it
// has an unreachable predecessor) and is considered a "dominator root". Since
// only Alt instructions can be "dominator roots" (other instructions would be
// "leaves"), only Alt instructions are required to be marked as predecessors.
//
// Dividing the Prog into "trees" comprises two passes: marking the "successor
// roots" and the predecessors; and marking the "dominator roots". Sorting the
// "successor roots" by their bytecode offsets enables iteration in order from
// greatest to least during the second pass; by working backwards in this case
// and flooding the graph no further than "leaves" and already marked "roots",
// it becomes possible to mark "dominator roots" without doing excessive work.
//
// Traversing the "trees" is just iterating over the "roots" in order of their
// marking and flooding the graph no further than "leaves" and "roots". When a
// "leaf" is reached, the instruction is copied with its successor remapped to
// its "root" number. When a "root" is reached, a Nop instruction is generated
// with its successor remapped similarly. As each "list" is produced, its last
// instruction is marked as such. After all of the "lists" have been produced,
// a pass over their instructions remaps their successors to bytecode offsets.
void Prog::Flatten() {
if (did_flatten_)
return;
did_flatten_ = true;
// Scratch structures. It's important that these are reused by functions
// that we call in loops because they would thrash the heap otherwise.
SparseSet reachable(size());
std::vector<int> stk;
stk.reserve(size());
// First pass: Marks "successor roots" and predecessors.
// Builds the mapping from inst-ids to root-ids.
SparseArray<int> rootmap(size());
SparseArray<int> predmap(size());
std::vector<std::vector<int>> predvec;
MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
// Second pass: Marks "dominator roots".
SparseArray<int> sorted(rootmap);
std::sort(sorted.begin(), sorted.end(), sorted.less);
for (SparseArray<int>::const_iterator i = sorted.end() - 1;
i != sorted.begin();
--i) {
if (i->index() != start_unanchored() && i->index() != start())
MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
}
// Third pass: Emits "lists". Remaps outs to root-ids.
// Builds the mapping from root-ids to flat-ids.
std::vector<int> flatmap(rootmap.size());
std::vector<Inst> flat;
flat.reserve(size());
for (SparseArray<int>::const_iterator i = rootmap.begin();
i != rootmap.end();
++i) {
flatmap[i->value()] = static_cast<int>(flat.size());
EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
flat.back().set_last();
}
list_count_ = static_cast<int>(flatmap.size());
for (int i = 0; i < kNumInst; i++)
inst_count_[i] = 0;
// Fourth pass: Remaps outs to flat-ids.
// Counts instructions by opcode.
for (int id = 0; id < static_cast<int>(flat.size()); id++) {
Inst* ip = &flat[id];
if (ip->opcode() != kInstAltMatch) // handled in EmitList()
ip->set_out(flatmap[ip->out()]);
inst_count_[ip->opcode()]++;
}
int total = 0;
for (int i = 0; i < kNumInst; i++)
total += inst_count_[i];
DCHECK_EQ(total, static_cast<int>(flat.size()));
// Remap start_unanchored and start.
if (start_unanchored() == 0) {
DCHECK_EQ(start(), 0);
} else if (start_unanchored() == start()) {
set_start_unanchored(flatmap[1]);
set_start(flatmap[1]);
} else {
set_start_unanchored(flatmap[1]);
set_start(flatmap[2]);
}
// Finally, replace the old instructions with the new instructions.
size_ = static_cast<int>(flat.size());
delete[] inst_;
inst_ = new Inst[size_];
memmove(inst_, flat.data(), size_ * sizeof *inst_);
}
// First pass of Flatten(): walks every instruction reachable from
// start_unanchored() and
//  - records in rootmap the "successor roots" (the outs of ByteRange,
//    Capture and EmptyWidth instructions), after instruction 0 and the two
//    start instructions;
//  - records in predmap/predvec, for each out of an Alt or AltMatch, the ids
//    of the Alt/AltMatch instructions that lead to it.
// reachable and stk are scratch space; both are cleared before use.
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Mark the kInstFail instruction.
  rootmap->set_new(0, rootmap->size());

  // Mark the start_unanchored and start instructions.
  for (int entry : {start_unanchored(), start()}) {
    if (!rootmap->has_index(entry))
      rootmap->set_new(entry, rootmap->size());
  }

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();

    // Keep following out() edges from id until the chain ends or reaches an
    // instruction that has been visited already.
    bool follow = true;
    while (follow && !reachable->contains(id)) {
      reachable->insert_new(id);
      follow = false;

      Inst* ip = inst(id);
      switch (ip->opcode()) {
        default:
          LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
          break;

        case kInstAltMatch:
        case kInstAlt:
          // Record this instruction as a predecessor of each of its outs.
          for (int out : {ip->out(), ip->out1()}) {
            if (!predmap->has_index(out)) {
              predmap->set_new(out, static_cast<int>(predvec->size()));
              predvec->emplace_back();
            }
            (*predvec)[predmap->get_existing(out)].emplace_back(id);
          }
          // out1 is visited later; out is followed right away.
          stk->push_back(ip->out1());
          id = ip->out();
          follow = true;
          break;

        case kInstByteRange:
        case kInstCapture:
        case kInstEmptyWidth:
          // The out of this instruction is a "root".
          if (!rootmap->has_index(ip->out()))
            rootmap->set_new(ip->out(), rootmap->size());
          id = ip->out();
          follow = true;
          break;

        case kInstNop:
          id = ip->out();
          follow = true;
          break;

        case kInstMatch:
        case kInstFail:
          break;
      }
    }
  }
}
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk) {
reachable->clear();
stk->clear();
stk->push_back(root);
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
if (id != root && rootmap->has_index(id)) {
// We reached another "tree" via epsilon transition.
continue;
}
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
case kInstAlt:
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
break;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
break;
}
}
for (SparseSet::const_iterator i = reachable->begin();
i != reachable->end();
++i) {
int id = *i;
if (predmap->has_index(id)) {
for (int pred : (*predvec)[predmap->get_existing(id)]) {
if (!reachable->contains(pred)) {
// id has a predecessor that cannot be reached from root!
// Therefore, id must be a "root" too - mark it as such.
if (!rootmap->has_index(id))
rootmap->set_new(id, rootmap->size());
}
}
}
}
}
// Third-pass helper of Flatten(): appends to *flat the "list" of
// instructions for the tree that starts at `root`.
//   root      - inst-id at which the traversal starts.
//   rootmap   - maps inst-ids of "roots" to root-ids; only read here.
//   flat      - output vector that receives the emitted instructions.
//   reachable - scratch set of visited inst-ids; cleared on entry.
//   stk       - scratch stack of pending inst-ids; cleared on entry.
// Every out written here through rootmap is a root-id, not yet an offset
// into *flat. The kInstAltMatch outs are the exception: they are written as
// positions in *flat directly.
void Prog::EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk) {
reachable->clear();
stk->clear();
stk->push_back(root);
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
Loop:
// Each instruction is visited at most once per call.
if (reachable->contains(id))
continue;
reachable->insert_new(id);
if (id != root && rootmap->has_index(id)) {
// We reached another "tree" via epsilon transition. Emit a kInstNop
// instruction so that the Prog does not become quadratically larger.
flat->emplace_back();
flat->back().set_opcode(kInstNop);
flat->back().set_out(rootmap->get_existing(id));
continue;
}
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
// flat->size() is read after the emplace_back(), so out names the slot
// right after this instruction and out1_ the slot after that one.
flat->emplace_back();
flat->back().set_opcode(kInstAltMatch);
flat->back().set_out(static_cast<int>(flat->size()));
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
FALLTHROUGH_INTENDED;
case kInstAlt:
// No instruction is emitted for a plain Alt: out1 is queued for later and
// out is followed immediately.
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
// "Leaf" instruction: copy it as is, then point its out at the root-id of
// its successor.
flat->emplace_back();
memmove(&flat->back(), ip, sizeof *ip);
flat->back().set_out(rootmap->get_existing(ip->out()));
break;
case kInstNop:
// Nops from the original program are not copied; just follow their out.
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
// Copied unchanged.
flat->emplace_back();
memmove(&flat->back(), ip, sizeof *ip);
break;
}
}
}
} // namespace re2

View File

@ -2,50 +2,27 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PROG_H_
#define RE2_PROG_H_
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#ifndef RE2_PROG_H__
#define RE2_PROG_H__
#include <stdint.h>
#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "re2/re2.h"
namespace re2 {
// Simple fixed-size bitmap.
template<int Bits>
class Bitmap {
public:
Bitmap() { Reset(); }
int Size() { return Bits; }
void Reset() {
for (int i = 0; i < Words; i++)
w_[i] = 0;
}
bool Get(int k) const {
return w_[k >> WordLog] & (1<<(k & 31));
}
void Set(int k) {
w_[k >> WordLog] |= 1<<(k & 31);
}
void Clear(int k) {
w_[k >> WordLog] &= ~(1<<(k & 31));
}
uint32 Word(int i) const {
return w_[i];
}
private:
static const int WordLog = 5;
static const int Words = (Bits+31)/32;
uint32 w_[Words];
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
};
// Opcodes for Inst
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
@ -56,6 +33,7 @@ enum InstOp {
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
kNumInst,
};
// Bit flags for empty-width specials
@ -69,10 +47,8 @@ enum EmptyOp {
kEmptyAllFlags = (1<<6)-1,
};
class Regexp;
class DFA;
struct OneState;
class Regexp;
// Compiled form of regexp program.
class Prog {
@ -85,19 +61,24 @@ class Prog {
public:
Inst() : out_opcode_(0), out1_(0) {}
// Copyable.
Inst(const Inst&) = default;
Inst& operator=(const Inst&) = default;
// Constructors per opcode
void InitAlt(uint32 out, uint32 out1);
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
void InitCapture(int cap, uint32 out);
void InitEmptyWidth(EmptyOp empty, uint32 out);
void InitAlt(uint32_t out, uint32_t out1);
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
void InitCapture(int cap, uint32_t out);
void InitEmptyWidth(EmptyOp empty, uint32_t out);
void InitMatch(int id);
void InitNop(uint32 out);
void InitNop(uint32_t out);
void InitFail();
// Getters
int id(Prog* p) { return this - p->inst_; }
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int out() { return out_opcode_>>3; }
int last() { return (out_opcode_>>3)&1; }
int out() { return out_opcode_>>4; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
@ -105,9 +86,12 @@ class Prog {
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
bool greedy(Prog* p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange;
return p->inst(out())->opcode() == kInstByteRange ||
(p->inst(out())->opcode() == kInstNop &&
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
}
// Does this inst (a kInstByteRange) match c?
@ -122,41 +106,45 @@ class Prog {
string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_, and PatchList steals another bit.)
// (Must fit in out_opcode_. PatchList/last steal another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<3) | opcode;
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
}
void set_last() {
out_opcode_ = (out()<<4) | (1<<3) | opcode();
}
void set_out(int out) {
out_opcode_ = (out<<3) | opcode();
out_opcode_ = (out<<4) | (last()<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<3) | opcode;
out_opcode_ = (out<<4) | (last()<<3) | opcode;
}
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32 out1_; // opcode == kInstAlt
uint32_t out1_; // opcode == kInstAlt
// alternate next instruction
int32 cap_; // opcode == kInstCapture
int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32 match_id_; // opcode == kInstMatch
int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8 lo_; // byte range is lo_-hi_ inclusive
uint8 hi_; //
uint8 foldcase_; // convert A-Z to a-z before checking range.
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
uint8_t foldcase_; // convert A-Z to a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
@ -166,8 +154,6 @@ class Prog {
friend class Compiler;
friend struct PatchList;
friend class Prog;
DISALLOW_EVIL_CONSTRUCTORS(Inst);
};
// Whether to anchor the search.
@ -200,13 +186,13 @@ class Prog {
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int64 size() { return size_; }
int size() { return size_; }
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
int64 byte_inst_count() { return byte_inst_count_; }
const Bitmap<256>& byterange() { return byterange_; }
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
int64 dfa_mem() { return dfa_mem_; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
bool anchor_start() { return anchor_start_; }
@ -214,21 +200,19 @@ class Prog {
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8* bytemap() { return bytemap_; }
const uint8_t* bytemap() { return bytemap_; }
// Lazily computed.
int first_byte();
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
// Record that at some point in the prog, the bytes in the range
// lo-hi (inclusive) are treated as different from bytes outside the range.
// Tracking this lets the DFA collapse commonly-treated byte ranges
// when recording state pointers, greatly reducing its memory footprint.
void MarkByteRange(int lo, int hi);
string DumpByteMap();
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32 EmptyFlags(const StringPiece& context, const char* p);
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
@ -237,7 +221,7 @@ class Prog {
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8 c) {
static bool IsWordChar(uint8_t c) {
return ('A' <= c && c <= 'Z') ||
('a' <= c && c <= 'z') ||
('0' <= c && c <= '9') ||
@ -270,19 +254,37 @@ class Prog {
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match0, bool* failed,
vector<int>* matches);
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, SparseSet* matches);
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
// will halt. Otherwise, the state has been built and next points to an array
// of bytemap_range()+1 slots holding the next states as per the bytemap and
// kByteEndText. The number of the state is implied by the callback sequence:
// the first callback is for state 0, the second callback is for state 1, ...
// match indicates whether the state is a matching state.
using DFAStateCallback = std::function<void(const int* next, bool match)>;
// Build the entire DFA for the given match kind.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work. This function is useful only
// for testing purposes. Returns number of states.
int BuildEntireDFA(MatchKind kind);
// avoids lots of unnecessary work.
// If cb is not empty, it receives one callback per state built.
// Returns the number of states built.
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
// Controls whether the DFA should bail out early if the NFA would be faster.
// FOR TESTING ONLY.
static void TEST_dfa_should_bail_when_slow(bool b);
// Compute bytemap.
void ComputeByteMap();
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
// Run peep-hole optimizer on program.
void Optimize();
@ -329,48 +331,80 @@ class Prog {
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the vector.
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
Regexp* re);
// its own Match instruction recording the index in the output vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
void Flatten();
// Walks the Prog; the "successor roots" or predecessors of the reachable
// instructions are marked in rootmap or predmap/predvec, respectively.
// reachable and stk are preallocated scratch structures.
void MarkSuccessors(SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the "dominator root"
// of the reachable instructions (if such exists) is marked in rootmap.
// reachable and stk are preallocated scratch structures.
void MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the reachable
// instructions are emitted in "list" form and appended to flat.
// reachable and stk are preallocated scratch structures.
void EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
void DeleteDFA(DFA* dfa);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_flatten_; // has Flatten been called?
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int byte_inst_count_; // number of kInstByteRange instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags
int onepass_statesize_; // byte size of each OneState* node
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
Inst* inst_; // pointer to instruction array
uint8_t* onepass_nodes_; // data for OnePass nodes
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
int64 dfa_mem_; // Maximum memory for DFAs.
void (*delete_dfa_)(DFA* dfa);
int64_t dfa_mem_; // Maximum memory for DFAs.
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
// commonly-treated byte range.
uint8 bytemap_[256]; // map from input bytes to byte classes
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
uint8_t bytemap_[256]; // map from input bytes to byte classes
uint8* onepass_nodes_; // data for OnePass nodes
OneState* onepass_start_; // start node for OnePass program
std::once_flag first_byte_once_;
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;
DISALLOW_EVIL_CONSTRUCTORS(Prog);
Prog(const Prog&) = delete;
Prog& operator=(const Prog&) = delete;
};
} // namespace re2
#endif // RE2_PROG_H__
#endif // RE2_PROG_H_

File diff suppressed because it is too large Load Diff

View File

@ -2,8 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H
#define RE2_RE2_H
#ifndef RE2_RE2_H_
#define RE2_RE2_H_
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
@ -17,7 +17,7 @@
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See http://code.google.com/p/re2/wiki/Syntax for the syntax
// See https://github.com/google/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
@ -179,38 +179,24 @@
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
// will leave 64 in a, b, c, and d.
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <map>
#include <mutex>
#include <string>
#include "re2/stringpiece.h"
#include "re2/variadic_function.h"
#ifndef RE2_HAVE_LONGLONG
#define RE2_HAVE_LONGLONG 1
#endif
#include "re2/stringpiece.h"
namespace re2 {
class Prog;
class Regexp;
} // namespace re2
namespace re2 {
// TODO(junyer): Get rid of this.
using std::string;
using std::map;
class Mutex;
class Prog;
class Regexp;
// The following enum should be used only as a constructor argument to indicate
// that the variable has static storage class, and that the constructor should
// do nothing to its state. It indicates to the reader that it is legal to
// declare a static instance of the class, provided the constructor is given
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
// static variable that has a constructor or a destructor because invocation
// order is undefined. However, IF the type can be initialized by filling with
// zeroes (which the loader does for static variables), AND the type's
// destructor does nothing to the storage, then a constructor for static
// initialization can be declared as
// explicit MyClass(LinkerInitialized x) {}
// and invoked as
// static MyClass my_variable_name(LINKER_INITIALIZED);
enum LinkerInitialized { LINKER_INITIALIZED };
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
@ -266,7 +252,7 @@ class RE2 {
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& option);
RE2(const StringPiece& pattern, const Options& options);
~RE2();
// Returns whether RE2 was created properly.
@ -293,6 +279,11 @@ class RE2 {
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
int ProgramFanout(std::map<int, int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
@ -300,21 +291,21 @@ class RE2 {
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// Matches "text" against "re". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
// You can pass in a "const char*" or a "string" or a "RE2" for "re".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// a. "text" matches "re" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
@ -330,32 +321,65 @@ class RE2 {
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// Exactly like FullMatch(), except that "re" is allowed to match
// a substring of "text".
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
static bool PartialMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// Like FullMatch() and PartialMatch(), except that "re" has to match
// a prefix of the text, and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
static bool ConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
// Like Consume(), but does not anchor the match at the beginning of
// the text. That is, "re" need not start its match at the beginning
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
// the next word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int argc);
static const VariadicFunction2<
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
#ifndef SWIG
private:
// Overload of Apply() for calls that supply no extra arguments:
// invokes f with a null argument array and an argument count of zero.
template <typename F, typename SP>
static inline bool Apply(F f, SP sp, const RE2& re) {
return f(sp, re, NULL, 0);
}
template <typename F, typename SP, typename... A>
static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
const Arg* const args[] = {&a...};
const int argc = sizeof...(a);
return f(sp, re, args, argc);
}
public:
// In order to allow FullMatch() et al. to be called with a varying number
// of arguments of varying types, we use two layers of variadic templates.
// The first layer constructs the temporary Arg objects. The second layer
// (above) constructs the array of pointers to the temporary Arg objects.
// Variadic front end for FullMatchN(): wraps each extra argument in a
// temporary Arg and passes them, through Apply(), to FullMatchN.
template <typename... A>
static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
}
// Variadic front end for PartialMatchN(): wraps each extra argument in a
// temporary Arg and passes them, through Apply(), to PartialMatchN.
template <typename... A>
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
}
// Variadic front end for ConsumeN(): wraps each extra argument in a
// temporary Arg and passes them, through Apply(), to ConsumeN.
template <typename... A>
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
}
// Variadic front end for FindAndConsumeN(): wraps each extra argument in a
// temporary Arg and passes them, through Apply(), to FindAndConsumeN.
template <typename... A>
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
}
#endif
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
@ -397,6 +421,8 @@ class RE2 {
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
//
// REQUIRES: "text" must not alias any part of "*out".
static bool Extract(const StringPiece &text,
const RE2& pattern,
const StringPiece &rewrite,
@ -440,17 +466,16 @@ class RE2 {
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const map<string, int>& NamedCapturingGroups() const;
const std::map<string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const map<int, string>& CapturingGroupNames() const;
const std::map<int, string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
@ -459,8 +484,8 @@ class RE2 {
// On a successful match, fills in match[] (up to nmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
// setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar",
// match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL.
//
// Don't ask for more match information than you will use:
// runs much faster with nmatch == 1 than nmatch > 1, and
@ -471,10 +496,10 @@ class RE2 {
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, match[i] == NULL.
// either way, match[i].data() == NULL.
bool Match(const StringPiece& text,
int startpos,
int endpos,
size_t startpos,
size_t endpos,
Anchor anchor,
StringPiece *match,
int nmatch) const;
@ -632,19 +657,7 @@ class RE2 {
void set_one_line(bool b) { one_line_ = b; }
void Copy(const Options& src) {
encoding_ = src.encoding_;
posix_syntax_ = src.posix_syntax_;
longest_match_ = src.longest_match_;
log_errors_ = src.log_errors_;
max_mem_ = src.max_mem_;
literal_ = src.literal_;
never_nl_ = src.never_nl_;
dot_nl_ = src.dot_nl_;
never_capture_ = src.never_capture_;
case_sensitive_ = src.case_sensitive_;
perl_classes_ = src.perl_classes_;
word_boundary_ = src.word_boundary_;
one_line_ = src.one_line_;
*this = src;
}
int ParseFlags() const;
@ -663,10 +676,6 @@ class RE2 {
bool perl_classes_;
bool word_boundary_;
bool one_line_;
//DISALLOW_EVIL_CONSTRUCTORS(Options);
Options(const Options&);
void operator=(const Options&);
};
// Returns the options set in the constructor.
@ -679,10 +688,8 @@ class RE2 {
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
#endif
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
@ -690,10 +697,8 @@ class RE2 {
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
#endif
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
@ -701,23 +706,20 @@ class RE2 {
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
#ifdef RE2_HAVE_LONGLONG
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
#endif
private:
void Init(const StringPiece& pattern, const Options& options);
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
size_t* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
mutable Mutex* mutex_;
string pattern_; // string regular expression
Options options_; // option flags
string prefix_; // required prefix (before regexp_)
@ -725,8 +727,9 @@ class RE2 {
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
mutable re2::Prog* rprog_; // reverse program for regexp
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable re2::Prog* rprog_; // reverse program for regexp
mutable const string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
@ -734,14 +737,19 @@ class RE2 {
mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const map<string, int>* named_groups_;
mutable const std::map<string, int>* named_groups_;
// Map from capture indices to names
mutable const map<int, string>* group_names_;
mutable const std::map<int, string>* group_names_;
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
RE2(const RE2&);
void operator=(const RE2&);
// Onces for lazy computations.
mutable std::once_flag rprog_once_;
mutable std::once_flag num_captures_once_;
mutable std::once_flag named_groups_once_;
mutable std::once_flag group_names_once_;
RE2(const RE2&) = delete;
RE2& operator=(const RE2&) = delete;
};
/***** Implementation details *****/
@ -752,7 +760,7 @@ class RE2 {
template <class T>
class _RE2_MatchObject {
public:
static inline bool Parse(const char* str, int n, void* dest) {
static inline bool Parse(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
@ -767,65 +775,64 @@ class RE2::Arg {
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, int n, void* dest);
typedef bool (*Parser)(const char* str, size_t n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type, name) \
Arg(type* p) : arg_(p), parser_(name) {} \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char);
MAKE_PARSER(signed char, parse_char);
MAKE_PARSER(signed char, parse_schar);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
#ifdef RE2_HAVE_LONGLONG
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
#endif
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
// Generic constructor
template <class T> Arg(T*, Parser parser);
// Generic constructor template
// Generic constructor templates
template <class T> Arg(T* p)
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
}
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
template <class T> Arg(T* p, Parser parser)
: arg_(p), parser_(parser) { }
// Parse the data
bool Parse(const char* str, int n) const;
bool Parse(const char* str, size_t n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
static bool parse_null (const char* str, size_t n, void* dest);
static bool parse_char (const char* str, size_t n, void* dest);
static bool parse_schar (const char* str, size_t n, void* dest);
static bool parse_uchar (const char* str, size_t n, void* dest);
static bool parse_float (const char* str, size_t n, void* dest);
static bool parse_double (const char* str, size_t n, void* dest);
static bool parse_string (const char* str, size_t n, void* dest);
static bool parse_stringpiece (const char* str, size_t n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_ ## name(const char* str, int n, void* dest); \
static bool parse_ ## name ## _radix( \
const char* str, int n, void* dest, int radix); \
static bool parse_##name(const char* str, size_t n, void* dest); \
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
int radix); \
\
public: \
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
@ -833,29 +840,31 @@ class RE2::Arg {
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
#ifdef RE2_HAVE_LONGLONG
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
#endif
#undef DECLARE_INTEGER_PARSER
};
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline bool RE2::Arg::Parse(const char* str, int n) const {
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
} \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
} \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
}
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
@ -863,15 +872,70 @@ MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
#ifdef RE2_HAVE_LONGLONG
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
#endif
#undef MAKE_INTEGER_PARSER
#ifndef SWIG
// Silence warnings about missing initializers for members of LazyRE2.
// Note that we test for Clang first because it defines __GNUC__ as well.
#if defined(__clang__)
#elif defined(__GNUC__) && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
// Helper for writing global or static RE2s safely.
// Write
// static LazyRE2 re = {".*"};
// and then use *re instead of writing
// static RE2 re(".*");
// The former is more careful about multithreaded
// situations than the latter.
//
// N.B. This class never deletes the RE2 object that
// it constructs: that's a feature, so that it can be used
// for global and function static variables.
class LazyRE2 {
private:
// Empty type; used only as the type of
// barrier_against_excess_initializers_ below.
struct NoArg {};
public:
typedef RE2 element_type; // support std::pointer_traits
// Constructor omitted to preserve braced initialization in C++98.
// Pretend to be a pointer to RE2 (never NULL due to on-demand creation):
RE2& operator*() const { return *get(); }
RE2* operator->() const { return get(); }
// Named accessor/initializer:
// Runs Init() through std::call_once on once_, then returns ptr_.
RE2* get() const {
std::call_once(once_, &LazyRE2::Init, this);
return ptr_;
}
// All data fields must be public to support {"foo"} initialization.
// pattern_ and options_ are the arguments Init() passes to the RE2
// constructor; ptr_ receives the constructed object; once_ guards Init().
const char* pattern_;
RE2::CannedOptions options_;
NoArg barrier_against_excess_initializers_;
mutable RE2* ptr_;
mutable std::once_flag once_;
private:
// Allocates the RE2 from pattern_ and options_ and stores it in ptr_.
// The object is never deleted by this class.
static void Init(const LazyRE2* lazy_re2) {
lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
}
void operator=(const LazyRE2&); // disallowed
};
#endif // SWIG
} // namespace re2
using re2::RE2;
using re2::LazyRE2;
#endif /* RE2_RE2_H */
#endif // RE2_RE2_H_

View File

@ -5,8 +5,21 @@
// Regular expression representation.
// Tested by parse_test.cc
#include "util/util.h"
#include "re2/regexp.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <mutex>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/mutex.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
@ -14,9 +27,9 @@ namespace re2 {
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(op),
: op_(static_cast<uint8_t>(op)),
simple_(false),
parse_flags_(static_cast<uint16>(parse_flags)),
parse_flags_(static_cast<uint16_t>(parse_flags)),
ref_(1),
nsub_(0),
down_(NULL) {
@ -43,6 +56,7 @@ Regexp::~Regexp() {
delete[] runes_;
break;
case kRegexpCharClass:
if (cc_)
cc_->Delete();
delete ccb_;
break;
@ -59,30 +73,29 @@ bool Regexp::QuickDestroy() {
return false;
}
static map<Regexp*, int> *ref_map;
GLOBAL_MUTEX(ref_mutex);
// Lazily allocated.
static Mutex* ref_mutex;
static std::map<Regexp*, int>* ref_map;
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
GLOBAL_MUTEX_LOCK(ref_mutex);
int r = 0;
if (ref_map != NULL) {
r = (*ref_map)[this];
}
GLOBAL_MUTEX_UNLOCK(ref_mutex);
return r;
MutexLock l(ref_mutex);
return (*ref_map)[this];
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
static std::once_flag ref_once;
std::call_once(ref_once, []() {
ref_mutex = new Mutex;
ref_map = new std::map<Regexp*, int>;
});
// Store ref count in overflow map.
GLOBAL_MUTEX_LOCK(ref_mutex);
if (ref_map == NULL) {
ref_map = new map<Regexp*, int>;
}
MutexLock l(ref_mutex);
if (ref_ == kMaxRef) {
// already overflowed
(*ref_map)[this]++;
@ -91,7 +104,6 @@ Regexp* Regexp::Incref() {
(*ref_map)[this] = kMaxRef;
ref_ = kMaxRef;
}
GLOBAL_MUTEX_UNLOCK(ref_mutex);
return this;
}
@ -103,15 +115,14 @@ Regexp* Regexp::Incref() {
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
GLOBAL_MUTEX_LOCK(ref_mutex);
MutexLock l(ref_mutex);
int r = (*ref_map)[this] - 1;
if (r < kMaxRef) {
ref_ = r;
ref_ = static_cast<uint16_t>(r);
ref_map->erase(this);
} else {
(*ref_map)[this] = r;
}
GLOBAL_MUTEX_UNLOCK(ref_mutex);
return;
}
ref_--;
@ -179,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
return re;
}
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
// Squash **, ++ and ??.
if (op == sub->op() && flags == sub->parse_flags())
return sub;
Regexp* re = new Regexp(kRegexpPlus, flags);
// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
// op is Star/Plus/Quest, we just have to check that sub->op() is too.
if ((sub->op() == kRegexpStar ||
sub->op() == kRegexpPlus ||
sub->op() == kRegexpQuest) &&
flags == sub->parse_flags()) {
// If sub is Star, no need to rewrite it.
if (sub->op() == kRegexpStar)
return sub;
// Rewrite sub to Star.
Regexp* re = new Regexp(kRegexpStar, flags);
re->AllocSub(1);
re->sub()[0] = sub->sub()[0]->Incref();
sub->Decref(); // We didn't consume the reference after all.
return re;
}
Regexp* re = new Regexp(op, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
// Builds a kRegexpPlus node over sub with the given parse flags.
// All of the work is delegated to StarPlusOrQuest, so the returned regexp
// is whatever that helper produces for (kRegexpPlus, sub, flags).
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpPlus, sub, flags);
}
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpStar, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
return StarPlusOrQuest(kRegexpStar, sub, flags);
}
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
return sub;
Regexp* re = new Regexp(kRegexpQuest, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
return StarPlusOrQuest(kRegexpQuest, sub, flags);
}
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
@ -211,6 +236,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
if (nsub == 1)
return sub[0];
if (nsub == 0) {
if (op == kRegexpAlternate)
return new Regexp(kRegexpNoMatch, flags);
else
return new Regexp(kRegexpEmptyMatch, flags);
}
Regexp** subcopy = NULL;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
@ -405,7 +437,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
vector<Regexp*> stk;
std::vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
@ -445,10 +477,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
continue;
}
int n = stk.size();
size_t n = stk.size();
if (n == 0)
break;
DCHECK_GE(n, 2);
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
@ -517,7 +550,9 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> {
private:
int ncapture_;
DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
NumCapturesWalker(const NumCapturesWalker&) = delete;
NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
};
int Regexp::NumCaptures() {
@ -532,8 +567,8 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker() { delete map_; }
map<string, int>* TakeMap() {
map<string, int>* m = map_;
std::map<string, int>* TakeMap() {
std::map<string, int>* m = map_;
map_ = NULL;
return m;
}
@ -542,7 +577,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new map<string, int>;
map_ = new std::map<string, int>;
// Record first occurrence of each name.
// (The rule is that if you have the same name
@ -560,11 +595,13 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
}
private:
map<string, int>* map_;
DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
std::map<string, int>* map_;
NamedCapturesWalker(const NamedCapturesWalker&) = delete;
NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
};
map<string, int>* Regexp::NamedCaptures() {
std::map<string, int>* Regexp::NamedCaptures() {
NamedCapturesWalker w;
w.Walk(this, 0);
return w.TakeMap();
@ -576,8 +613,8 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker() { delete map_; }
map<int, string>* TakeMap() {
map<int, string>* m = map_;
std::map<int, string>* TakeMap() {
std::map<int, string>* m = map_;
map_ = NULL;
return m;
}
@ -586,7 +623,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new map<int, string>;
map_ = new std::map<int, string>;
(*map_)[re->cap()] = *re->name();
}
@ -600,11 +637,13 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
}
private:
map<int, string>* map_;
DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
std::map<int, string>* map_;
CaptureNamesWalker(const CaptureNamesWalker&) = delete;
CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
};
map<int, string>* Regexp::CaptureNames() {
std::map<int, string>* Regexp::CaptureNames() {
CaptureNamesWalker w;
w.Walk(this, 0);
return w.TakeMap();
@ -643,7 +682,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = re->runes_[j];
(*prefix)[j] = static_cast<char>(re->runes_[j]);
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
@ -652,7 +691,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
*p++ = r;
*p++ = static_cast<char>(r);
else
p += runetochar(p, &r);
}
@ -662,14 +701,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, re->rune_);
prefix->append(1, static_cast<char>(re->rune_));
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase);
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
i++;
// The rest.
@ -704,13 +743,13 @@ bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
Rune lo1 = max<Rune>(lo, 'A');
Rune hi1 = min<Rune>(hi, 'Z');
Rune lo1 = std::max<Rune>(lo, 'A');
Rune hi1 = std::min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = max<Rune>(lo, 'a');
hi1 = min<Rune>(hi, 'z');
lo1 = std::max<Rune>(lo, 'a');
hi1 = std::min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
@ -826,7 +865,7 @@ void CharClassBuilder::RemoveAbove(Rune r) {
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
vector<RuneRange> v;
std::vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
@ -863,7 +902,7 @@ void CharClassBuilder::Negate() {
CharClass* CharClass::New(int maxranges) {
CharClass* cc;
uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
cc = reinterpret_cast<CharClass*>(data);
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
cc->nranges_ = 0;
@ -873,7 +912,7 @@ CharClass* CharClass::New(int maxranges) {
}
void CharClass::Delete() {
uint8 *data = reinterpret_cast<uint8*>(this);
uint8_t* data = reinterpret_cast<uint8_t*>(this);
delete[] data;
}
@ -915,7 +954,7 @@ bool CharClass::Contains(Rune r) {
}
CharClass* CharClassBuilder::GetCharClass() {
CharClass* cc = CharClass::New(ranges_.size());
CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
int n = 0;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_[n++] = *it;

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
@ -83,10 +86,14 @@
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#ifndef RE2_REGEXP_H__
#define RE2_REGEXP_H__
#include <stdint.h>
#include <map>
#include <set>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
namespace re2 {
@ -185,10 +192,10 @@ class RegexpStatus {
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
void set_code(enum RegexpStatusCode code) { code_ = code; }
void set_code(RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
enum RegexpStatusCode code() const { return code_; }
RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
@ -197,23 +204,21 @@ class RegexpStatus {
// Returns text equivalent of code, e.g.:
// "Bad character class"
static string CodeText(enum RegexpStatusCode code);
static string CodeText(RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
enum RegexpStatusCode code_; // Kind of error
RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
RegexpStatus(const RegexpStatus&) = delete;
RegexpStatus& operator=(const RegexpStatus&) = delete;
};
// Walker to implement Simplify.
class SimplifyWalker;
// Compiled form; see prog.h
class Prog;
@ -261,7 +266,9 @@ class CharClass {
int nrunes_;
RuneRange *ranges_;
int nranges_;
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
CharClass(const CharClass&) = delete;
CharClass& operator=(const CharClass&) = delete;
};
class Regexp {
@ -306,14 +313,15 @@ class Regexp {
UnicodeGroups,
// Internal use only.
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
AllParseFlags = (1<<14)-1,
};
// Get. No set, Regexps are logically immutable once created.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_; }
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
bool simple() { return simple_ != 0; }
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
Regexp** sub() {
@ -353,6 +361,7 @@ class Regexp {
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class CoalesceWalker;
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
@ -369,12 +378,12 @@ class Regexp {
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
map<string, int>* NamedCaptures();
std::map<string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
map<int, string>* CaptureNames();
std::map<int, string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
@ -410,8 +419,8 @@ class Regexp {
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64 max_mem);
Prog* CompileToReverseProg(int64 max_mem);
Prog* CompileToProg(int64_t max_mem);
Prog* CompileToReverseProg(int64_t max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
@ -427,6 +436,8 @@ class Regexp {
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
// regardless of the return value.
bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
private:
@ -441,6 +452,7 @@ class Regexp {
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
@ -451,6 +463,10 @@ class Regexp {
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a Star, Plus or Quest,
// squashing the pair if sub is also a Star, Plus or Quest.
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
@ -478,8 +494,7 @@ class Regexp {
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
static int FactorAlternationRecursive(Regexp** sub, int nsub,
ParseFlags flags, int maxdepth);
friend class FactorAlternationImpl;
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
@ -488,11 +503,10 @@ class Regexp {
// Allocate space for n sub-regexps.
void AllocSub(int n) {
if (n < 0 || static_cast<uint16>(n) != n)
LOG(FATAL) << "Cannot AllocSub " << n;
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = n;
nsub_ = static_cast<uint16_t>(n);
}
// Add Rune to LiteralString
@ -502,38 +516,38 @@ class Regexp {
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8 instead of RegexpOp to control space usage.
uint8 op_;
// uint8_t instead of RegexpOp to control space usage.
uint8_t op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8 instead of bool to control space usage.
uint8 simple_;
// uint8_t instead of bool to control space usage.
uint8_t simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16 instead of ParseFlags to control space usage.
uint16 parse_flags_;
// uint16_t instead of ParseFlags to control space usage.
uint16_t parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16 to control space usage.
// uint16_t to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (100),
// ref greater than the maximum repeat count (kMaxRepeat),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16 ref_;
static const uint16 kMaxRef = 0xffff;
uint16_t ref_;
static const uint16_t kMaxRef = 0xffff;
// Subexpressions.
// uint16 to control space usage.
// uint16_t to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16 nsub_;
static const uint16 kMaxNsub = 0xffff;
uint16_t nsub_;
static const uint16_t kMaxNsub = 0xffff;
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
@ -568,11 +582,12 @@ class Regexp {
void *the_union_[2]; // as big as any other element, for memset
};
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
Regexp(const Regexp&) = delete;
Regexp& operator=(const Regexp&) = delete;
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
class CharClassBuilder {
public:
@ -597,37 +612,41 @@ class CharClassBuilder {
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
private:
static const uint32 AlphaMask = (1<<26) - 1;
uint32 upper_; // bitmap of A-Z
uint32 lower_; // bitmap of a-z
static const uint32_t AlphaMask = (1<<26) - 1;
uint32_t upper_; // bitmap of A-Z
uint32_t lower_; // bitmap of a-z
int nrunes_;
RuneRangeSet ranges_;
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
CharClassBuilder(const CharClassBuilder&) = delete;
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
};
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
// Bitwise ops on ParseFlags produce ParseFlags.
// Bitwise OR of two ParseFlags values, computed on their int representations
// and converted back so that the result keeps the ParseFlags type.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int lhs = static_cast<int>(a);
  const int rhs = static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(lhs | rhs);
}
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
// Bitwise XOR of two ParseFlags values, computed on their int representations
// and converted back so that the result keeps the ParseFlags type.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int lhs = static_cast<int>(a);
  const int rhs = static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(lhs ^ rhs);
}
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
// Bitwise AND of two ParseFlags values, computed on their int representations
// and converted back so that the result keeps the ParseFlags type.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
                                    Regexp::ParseFlags b) {
  const int lhs = static_cast<int>(a);
  const int rhs = static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(lhs & rhs);
}
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
// Bitwise complement of a ParseFlags value, restricted to the bits covered
// by Regexp::AllParseFlags.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
  // Attempting to produce a value out of enum's range has undefined behaviour,
  // so the complement is masked before it is converted back.
  const int mask = static_cast<int>(Regexp::AllParseFlags);
  const int inverted = ~static_cast<int>(a);
  return static_cast<Regexp::ParseFlags>(inverted & mask);
}
} // namespace re2
#endif // RE2_REGEXP_H__
#endif // RE2_REGEXP_H_

View File

@ -4,36 +4,42 @@
#include "re2/set.h"
#include <stddef.h>
#include <algorithm>
#include <memory>
#include "util/util.h"
#include "util/logging.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
using namespace re2;
namespace re2 {
// Creates an empty, not-yet-compiled set that will use a copy of the given
// options and the given anchoring mode.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false),
      size_(0) {
  // Work on a private copy of the options so that the tweak below does not
  // touch the caller's object.
  options_.Copy(options);
  options_.set_never_capture(true);  // might unblock some optimisations
}
RE2::Set::~Set() {
for (size_t i = 0; i < re_.size(); i++)
re_[i]->Decref();
for (size_t i = 0; i < elem_.size(); i++)
elem_[i].second->Decref();
delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add after Compile";
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
@ -45,7 +51,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
}
// Concatenate with match index and push on vector.
int n = re_.size();
int n = static_cast<int>(elem_.size());
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
@ -62,52 +68,87 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
re_.push_back(re);
elem_.emplace_back(pattern.ToString(), re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile multiple times";
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
return false;
}
compiled_ = true;
size_ = static_cast<int>(elem_.size());
// Sort the elements by their patterns. This is good enough for now
// until we have a Regexp comparison function. (Maybe someday...)
std::sort(elem_.begin(), elem_.end(),
[](const Elem& a, const Elem& b) -> bool {
return a.first < b.first;
});
re2::Regexp** sub = new re2::Regexp*[size_];
for (size_t i = 0; i < elem_.size(); i++)
sub[i] = elem_[i].second;
elem_.clear();
elem_.shrink_to_fit();
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
re_.size(), pf);
re_.clear();
re2::Regexp* sre = re->Simplify();
re->Decref();
re = sre;
if (re == NULL) {
if (options_.log_errors())
LOG(ERROR) << "Error simplifying during Compile.";
return false;
}
re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
delete[] sub;
prog_ = Prog::CompileSet(options_, anchor_, re);
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
re->Decref();
return prog_ != NULL;
}
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
if (!compiled_) {
LOG(DFATAL) << "RE2::Set::Match without Compile";
return false;
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
return Match(text, v, NULL);
}
v->clear();
bool failed;
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
Prog::kManyMatch, NULL, &failed, v);
if (failed)
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
if (ret == false)
return false;
if (v->size() == 0) {
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
ErrorInfo* error_info) const {
if (!compiled_) {
LOG(DFATAL) << "RE2::Set::Match() called before compiling";
if (error_info != NULL)
error_info->kind = kNotCompiled;
return false;
}
bool dfa_failed = false;
std::unique_ptr<SparseSet> matches;
if (v != NULL) {
matches.reset(new SparseSet(size_));
v->clear();
}
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
NULL, &dfa_failed, matches.get());
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
<< "bytemap range " << prog_->bytemap_range() << ", "
<< "list count " << prog_->list_count();
if (error_info != NULL)
error_info->kind = kOutOfMemory;
return false;
}
if (ret == false) {
if (error_info != NULL)
error_info->kind = kNoError;
return false;
}
if (v != NULL) {
if (matches->empty()) {
LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
if (error_info != NULL)
error_info->kind = kInconsistent;
return false;
}
v->assign(matches->begin(), matches->end());
}
if (error_info != NULL)
error_info->kind = kNoError;
return true;
}
} // namespace re2

View File

@ -2,54 +2,79 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H
#define RE2_SET_H
#ifndef RE2_SET_H_
#define RE2_SET_H_
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"
namespace re2 {
using std::vector;
class Prog;
class Regexp;
} // namespace re2
namespace re2 {
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
class RE2::Set {
public:
enum ErrorKind {
kNoError = 0,
kNotCompiled, // The set is not compiled.
kOutOfMemory, // The DFA ran out of memory.
kInconsistent, // The result is inconsistent. This should never happen.
};
struct ErrorInfo {
ErrorKind kind;
};
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Add adds regexp pattern to the set, interpreted using the RE2 options.
// (The RE2 constructor's default options parameter is RE2::UTF8.)
// Add returns the regexp index that will be used to identify
// it in the result of Match, or -1 if the regexp cannot be parsed.
// Adds pattern to the set using the options passed to the constructor.
// Returns the index that will identify the regexp in the output of Match(),
// or -1 if the regexp cannot be parsed.
// Indices are assigned in sequential order starting from 0.
// Error returns do not increment the index.
// If an error occurs and error != NULL, *error will hold an error message.
// Errors do not increment the index; if error is not NULL, *error will hold
// the error message from the parser.
int Add(const StringPiece& pattern, string* error);
// Compile prepares the Set for matching.
// Add must not be called again after Compile.
// Compile must be called before FullMatch or PartialMatch.
// Compile may return false if it runs out of memory.
// Compiles the set in preparation for matching.
// Returns false if the compiler runs out of memory.
// Add() must not be called again after Compile().
// Compile() must be called before Match().
bool Compile();
// Match returns true if text matches any of the regexps in the set.
// If so, it fills v with the indices of the matching regexps.
bool Match(const StringPiece& text, vector<int>* v) const;
// Returns true if text matches at least one of the regexps in the set.
// Fills v (if not NULL) with the indices of the matching regexps.
// Callers must not expect v to be sorted.
bool Match(const StringPiece& text, std::vector<int>* v) const;
// As above, but populates error_info (if not NULL) when none of the regexps
// in the set matched. This can inform callers when DFA execution fails, for
// example, because they might wish to handle that case differently.
bool Match(const StringPiece& text, std::vector<int>* v,
ErrorInfo* error_info) const;
private:
typedef std::pair<string, re2::Regexp*> Elem;
RE2::Options options_;
RE2::Anchor anchor_;
vector<re2::Regexp*> re_;
std::vector<Elem> elem_;
re2::Prog* prog_;
bool compiled_;
//DISALLOW_EVIL_CONSTRUCTORS(Set);
Set(const Set&);
void operator=(const Set&);
int size_;
Set(const Set&) = delete;
Set& operator=(const Set&) = delete;
};
} // namespace re2
#endif // RE2_SET_H
#endif // RE2_SET_H_

View File

@ -6,7 +6,11 @@
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
@ -61,7 +65,7 @@ bool Regexp::ComputeSimple() {
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple_)
if (!subs[i]->simple())
return false;
return true;
case kRegexpCharClass:
@ -71,12 +75,12 @@ bool Regexp::ComputeSimple() {
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple_;
return subs[0]->simple();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple_)
if (!subs[0]->simple())
return false;
switch (subs[0]->op_) {
case kRegexpStar:
@ -96,6 +100,37 @@ bool Regexp::ComputeSimple() {
return false;
}
// Walker subclass used by Simplify.
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
// occurrences of that literal into repeats of that literal. It also works for
// char classes, any char and any byte.
// PostVisit creates the coalesced result, which should then be simplified.
class CoalesceWalker : public Regexp::Walker<Regexp*> {
public:
CoalesceWalker() {}
// Invoked with the results for re's children in child_args.
// Returns either re itself (with an extra reference) or a newly built regexp.
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
// Returns re with its reference count incremented.
virtual Regexp* Copy(Regexp* re);
// Not expected to be called: the implementation logs a DFATAL message
// and returns re with its reference count incremented.
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside CoalesceWalker so that
// they can edit the private fields of the Regexps they construct.
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
// the parse flags are consistent. (They will not be checked again later.)
static bool CanCoalesce(Regexp* r1, Regexp* r2);
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
// will be empty match and the coalesced op. In other cases, where part of a
// literal string was removed to be coalesced, the array elements afterwards
// will be the coalesced op and the remainder of the literal string.
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
// Copying and assignment are disabled.
CoalesceWalker(const CoalesceWalker&) = delete;
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
};
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
@ -104,9 +139,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
virtual Regexp* PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
@ -130,7 +163,8 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
SimplifyWalker(const SimplifyWalker&) = delete;
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
};
// Simplifies a regular expression, returning a new regexp.
@ -143,14 +177,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
// Caller must Decref() return value when done with it.
Regexp* Regexp::Simplify() {
if (simple_)
return Incref();
SimplifyWalker w;
return w.Walk(this, NULL);
CoalesceWalker cw;
Regexp* cre = cw.Walk(this, NULL);
if (cre == NULL)
return cre;
SimplifyWalker sw;
Regexp* sre = sw.Walk(cre, NULL);
cre->Decref();
return sre;
}
#define Simplify DontCallSimplify // Avoid accidental recursion
// Utility function for PostVisit implementations that compares re->sub() with
// child_args to determine whether any child_args changed. In the common case,
// where nothing changed, calls Decref() for all child_args and returns false,
// so PostVisit must return re->Incref(). Otherwise, returns true.
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
  const int nsub = re->nsub();

  // Scan for the first position where the visited child differs from the
  // sub-expression that re already holds.
  int i = 0;
  while (i < nsub && child_args[i] == re->sub()[i])
    ++i;
  if (i < nsub)
    return true;

  // Nothing changed: drop the references carried by child_args, because the
  // caller is going to hand back re itself.
  for (int k = 0; k < nsub; ++k)
    child_args[k]->Decref();
  return false;
}
// Returns re itself after taking an additional reference on it; no new
// Regexp is allocated.
Regexp* CoalesceWalker::Copy(Regexp* re) {
return re->Incref();
}
// Fallback visit that is not expected to run for this walker.
// Logs a DFATAL message and returns re with an additional reference;
// parent_arg is ignored.
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
return re->Incref();
}
// Produces the coalesced form of re from the already-processed children in
// child_args. Returns re (with an extra reference) when nothing changed,
// otherwise a newly allocated regexp that takes over the child references.
Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  const int nsub = re->nsub();

  // A regexp without sub-expressions has nothing to coalesce.
  if (nsub == 0)
    return re->Incref();

  // Only adjacent children of a concatenation are candidates for coalescing.
  bool can_coalesce = false;
  if (re->op() == kRegexpConcat) {
    for (int i = 0; i + 1 < nsub; i++) {
      if (CanCoalesce(child_args[i], child_args[i+1])) {
        can_coalesce = true;
        break;
      }
    }
  }

  if (!can_coalesce) {
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // Something changed. Build a new op around the new children.
    Regexp* copy = new Regexp(re->op(), re->parse_flags());
    copy->AllocSub(nsub);
    Regexp** copy_subs = copy->sub();
    for (int i = 0; i < nsub; i++)
      copy_subs[i] = child_args[i];

    // Repeats and Captures have additional data that must be copied.
    // (Neither case applies when re is a concatenation.)
    if (re->op() == kRegexpRepeat) {
      copy->min_ = re->min();
      copy->max_ = re->max();
    } else if (re->op() == kRegexpCapture) {
      copy->cap_ = re->cap();
    }
    return copy;
  }

  // Coalesce left to right. Each pair is re-tested because an earlier
  // DoCoalesce call may have rewritten the left-hand element.
  for (int i = 0; i + 1 < nsub; i++) {
    if (CanCoalesce(child_args[i], child_args[i+1]))
      DoCoalesce(&child_args[i], &child_args[i+1]);
  }

  // Determine how many empty matches were left by DoCoalesce.
  int nempty = 0;
  for (int i = 0; i < nsub; i++) {
    if (child_args[i]->op() == kRegexpEmptyMatch)
      nempty++;
  }

  // Build a new op that keeps every child except the empty matches.
  Regexp* merged = new Regexp(re->op(), re->parse_flags());
  merged->AllocSub(nsub - nempty);
  Regexp** merged_subs = merged->sub();
  int out = 0;
  for (int i = 0; i < nsub; i++) {
    Regexp* child = child_args[i];
    if (child->op() == kRegexpEmptyMatch)
      child->Decref();
    else
      merged_subs[out++] = child;
  }
  return merged;
}
// Decides whether the adjacent concatenation elements r1 and r2 can be
// merged into a single repeat. Also verifies that the relevant parse flags
// agree.
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
  // r1 must be a star/plus/quest/repeat ...
  const RegexpOp op1 = r1->op();
  if (op1 != kRegexpStar &&
      op1 != kRegexpPlus &&
      op1 != kRegexpQuest &&
      op1 != kRegexpRepeat)
    return false;

  // ... of a literal, char class, any char or any byte.
  Regexp* atom = r1->sub()[0];
  switch (atom->op()) {
    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      break;
    default:
      return false;
  }

  // r2 may be a star/plus/quest/repeat of the same atom, provided that the
  // greediness of the two operators is consistent.
  const RegexpOp op2 = r2->op();
  const bool r2_is_quantifier = op2 == kRegexpStar ||
                                op2 == kRegexpPlus ||
                                op2 == kRegexpQuest ||
                                op2 == kRegexpRepeat;
  if (r2_is_quantifier &&
      Regexp::Equal(atom, r2->sub()[0]) &&
      ((r1->parse_flags() & Regexp::NonGreedy) ==
       (r2->parse_flags() & Regexp::NonGreedy)))
    return true;

  // ... OR a plain occurrence of that same atom.
  if (Regexp::Equal(atom, r2))
    return true;

  // ... OR a literal string that begins with that literal, provided that the
  // case folding of the two is consistent.
  return atom->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == atom->rune() &&
         ((atom->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase));
}
// Folds the adjacent operands *r1ptr and *r2ptr into one repeat of r1's
// operand whose min/max are the combined counts of both sides (-1 means
// unbounded).  On success the two slots are overwritten and the original
// operands are Decref'd:
//   - right side fully absorbed: *r1ptr becomes an empty match and *r2ptr
//     the new repeat;
//   - right side is a literal string with runes left over: *r1ptr becomes
//     the new repeat and *r2ptr the remaining literal string.
// On an unexpected op the slots are left untouched.
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
  Regexp* left = *r1ptr;
  Regexp* right = *r2ptr;

  Regexp* merged = Regexp::Repeat(
      left->sub()[0]->Incref(), left->parse_flags(), 0, 0);

  // Seed the bounds from the left operand.
  switch (left->op()) {
    case kRegexpStar:
      merged->min_ = 0;
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_ = 1;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      merged->min_ = 0;
      merged->max_ = 1;
      break;

    case kRegexpRepeat:
      merged->min_ = left->min();
      merged->max_ = left->max();
      break;

    default:
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << left->op();
      merged->Decref();
      return;
  }

  // Add the contribution of the right operand.  |absorbed| stays true unless
  // a literal string has runes left after its leading run was consumed.
  bool absorbed = true;
  int consumed = 0;
  switch (right->op()) {
    case kRegexpStar:
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_++;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpRepeat:
      merged->min_ += right->min();
      if (right->max() == -1)
        merged->max_ = -1;
      else if (merged->max() != -1)
        merged->max_ += right->max();
      break;

    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      // Exactly one more occurrence.
      merged->min_++;
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpLiteralString: {
      Rune r = left->sub()[0]->rune();
      // Count the leading run of |r|; the first rune is counted
      // unconditionally.
      consumed = 1;
      while (consumed < right->nrunes() && right->runes()[consumed] == r)
        consumed++;
      merged->min_ += consumed;
      if (merged->max() != -1)
        merged->max_ += consumed;
      absorbed = (consumed == right->nrunes());
      break;
    }

    default:
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << right->op();
      merged->Decref();
      return;
  }

  if (absorbed) {
    // Nothing of the right operand remains: leave an empty match in the
    // left slot and put the repeat in the right slot.
    *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
    *r2ptr = merged;
  } else {
    // Keep the unconsumed tail of the literal string after the repeat.
    *r1ptr = merged;
    *r2ptr = Regexp::LiteralString(
        &right->runes()[consumed], right->nrunes() - consumed,
        right->parse_flags());
  }

  left->Decref();
  right->Decref();
}
// Walker hook for copying an already-computed result: no new node is built,
// the same |re| is handed back via Incref() (presumably with its reference
// count bumped -- Incref() is defined elsewhere).
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
@ -163,7 +444,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
}
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
if (re->simple_) {
if (re->simple()) {
*stop = true;
return re->Incref();
}
@ -196,29 +477,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
// Two passes to avoid allocation in the common case.
bool changed = false;
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
Regexp* newsub = child_args[i];
if (newsub != sub) {
changed = true;
break;
}
}
if (!changed) {
for (int i = 0; i < re->nsub_; i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub_);
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i <re->nsub_; i++)
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
@ -234,7 +500,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap_;
nre->cap_ = re->cap();
nre->simple_ = true;
return nre;
}
@ -323,13 +589,12 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
return Regexp::Plus(re->Incref(), f);
// General case: x{4,} is xxxx+
Regexp* nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
VLOG(1) << "Simplify " << min;
Regexp** nre_subs = nre->sub();
Regexp** nre_subs = new Regexp*[min];
for (int i = 0; i < min-1; i++)
nre_subs[i] = re->Incref();
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
Regexp* nre = Regexp::Concat(nre_subs, min, f);
delete[] nre_subs;
return nre;
}
@ -348,11 +613,11 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
// Build leading prefix: xx. Capturing only on the last one.
Regexp* nre = NULL;
if (min > 0) {
nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
Regexp** nre_subs = nre->sub();
Regexp** nre_subs = new Regexp*[min];
for (int i = 0; i < min; i++)
nre_subs[i] = re->Incref();
nre = Regexp::Concat(nre_subs, min, f);
delete[] nre_subs;
}
// Build and attach suffix: (x(x(x)?)?)?

View File

@ -0,0 +1,65 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/stringpiece.h"
#include <ostream>
#include "util/util.h"
namespace re2 {
// Out-of-line definition of the static constant so that it has storage when
// it is odr-used; the value itself comes from the in-class initializer.
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
// Copies at most |n| bytes, starting at offset |pos|, into |buf| and returns
// the number of bytes copied.  No NUL terminator is written.
//
// A |pos| at or past the end copies nothing and returns 0.  Without this
// guard, pos > size_ makes the unsigned "size_ - pos" wrap around to a huge
// value and memcpy() reads far outside the buffer.
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
                                         size_type pos) const {
  if (pos >= size_)
    return 0;
  size_type ret = std::min(size_ - pos, n);
  // Skip the call for zero bytes: memcpy() requires valid pointers even then.
  if (ret > 0)
    memcpy(buf, data_ + pos, ret);
  return ret;
}
// Returns the piece covering at most |n| bytes starting at |pos|.  Both
// arguments are clamped to the available data, so an out-of-range |pos|
// yields an empty piece positioned at the end.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = std::min(pos, size_);
  const size_type count = std::min(n, size_ - start);
  return StringPiece(data_ + start, count);
}
// Returns the offset of the first occurrence of |s| at or after |pos|, or
// npos when there is none or |pos| lies past the end.
StringPiece::size_type StringPiece::find(const StringPiece& s,
                                         size_type pos) const {
  if (pos > size_)
    return npos;

  const_pointer haystack_end = data_ + size_;
  const_pointer needle_end = s.data_ + s.size_;
  const_pointer hit = std::search(data_ + pos, haystack_end,
                                  s.data_, needle_end);

  // std::search returns haystack_end on failure; in that case the match
  // would not fit inside the piece unless |s| is empty.
  const size_type offset = hit - data_;
  if (offset + s.size_ > size_)
    return npos;
  return offset;
}
// Returns the offset of the first |c| at or after |pos|, or npos if absent.
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
  // Also covers the empty piece: with size_ == 0 every pos is >= size_.
  if (pos >= size_)
    return npos;

  const_pointer end = data_ + size_;
  const_pointer hit = std::find(data_ + pos, end, c);
  if (hit == end)
    return npos;
  return hit - data_;
}
// Returns the offset of the last occurrence of |s| that starts at or before
// |pos|, or npos when there is none.
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
                                          size_type pos) const {
  if (s.size_ > size_)
    return npos;
  // An empty needle matches at |pos|, clamped to the end of the piece.
  if (s.size_ == 0)
    return std::min(size_, pos);

  // Last admissible start offset, and the end of the window that a match
  // starting there would occupy.
  const size_type last_start = std::min(size_ - s.size_, pos);
  const_pointer window_end = data_ + last_start + s.size_;

  const_pointer hit = std::find_end(data_, window_end,
                                    s.data_, s.data_ + s.size_);
  if (hit == window_end)
    return npos;
  return hit - data_;
}
// Returns the offset of the last |c| located at or before |pos|, or npos if
// there is none.  |pos| defaults to npos, meaning "search the whole piece".
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
  if (size_ == 0)
    return npos;

  // Number of leading bytes to examine.  This must not be computed as
  // min(pos + 1, size_): for pos == npos the addition wraps to 0, so the
  // search would examine nothing and always report npos.
  size_type i = pos < size_ ? pos + 1 : size_;
  while (i != 0) {
    if (data_[--i] == c)
      return i;
  }
  return npos;
}
// Streams the raw bytes of |p| (embedded NULs included) as unformatted
// output and returns the stream for chaining.
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
  return o.write(p.data(), p.size());
}
} // namespace re2

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_STRINGPIECE_H_
#define RE2_STRINGPIECE_H_
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
@ -16,140 +19,145 @@
//
// Arghh! I wish C++ literals were "string".
#ifndef STRINGS_STRINGPIECE_H__
#define STRINGS_STRINGPIECE_H__
#include <stddef.h>
#include <string.h>
#include <cstddef>
#include <algorithm>
#include <iosfwd>
#include <iterator>
#include <string>
namespace re2 {
class StringPiece {
private:
const char* ptr_;
int length_;
public:
typedef char value_type;
typedef char* pointer;
typedef const char* const_pointer;
typedef char& reference;
typedef const char& const_reference;
typedef const char* const_iterator;
typedef const_iterator iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
typedef const_reverse_iterator reverse_iterator;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
static const size_type npos = static_cast<size_type>(-1);
// We provide non-explicit singleton constructors so users can pass
// in a "const char*" or a "string" wherever a "StringPiece" is
// expected.
StringPiece() : ptr_(NULL), length_(0) { }
StringPiece(const char* str)
: ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
StringPiece()
: data_(NULL), size_(0) {}
StringPiece(const std::string& str)
: ptr_(str.data()), length_(static_cast<int>(str.size())) { }
StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
: data_(str.data()), size_(str.size()) {}
StringPiece(const char* str)
: data_(str), size_(str == NULL ? 0 : strlen(str)) {}
StringPiece(const char* str, size_type len)
: data_(str), size_(len) {}
// data() may return a pointer to a buffer with embedded NULs, and the
// returned buffer may or may not be null terminated. Therefore it is
// typically a mistake to pass data() to a routine that expects a NUL
// terminated string.
const char* data() const { return ptr_; }
int size() const { return length_; }
int length() const { return length_; }
bool empty() const { return length_ == 0; }
const_iterator begin() const { return data_; }
const_iterator end() const { return data_ + size_; }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(data_ + size_);
}
const_reverse_iterator rend() const {
return const_reverse_iterator(data_);
}
size_type size() const { return size_; }
size_type length() const { return size_; }
bool empty() const { return size_ == 0; }
const_reference operator[](size_type i) const { return data_[i]; }
const_pointer data() const { return data_; }
void remove_prefix(size_type n) {
data_ += n;
size_ -= n;
}
void remove_suffix(size_type n) {
size_ -= n;
}
void clear() { ptr_ = NULL; length_ = 0; }
void set(const char* data, int len) { ptr_ = data; length_ = len; }
void set(const char* str) {
ptr_ = str;
if (str != NULL)
length_ = static_cast<int>(strlen(str));
else
length_ = 0;
}
void set(const void* data, int len) {
ptr_ = reinterpret_cast<const char*>(data);
length_ = len;
data_ = str;
size_ = str == NULL ? 0 : strlen(str);
}
char operator[](int i) const { return ptr_[i]; }
void remove_prefix(int n) {
ptr_ += n;
length_ -= n;
}
void remove_suffix(int n) {
length_ -= n;
}
int compare(const StringPiece& x) const {
int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
if (r == 0) {
if (length_ < x.length_) r = -1;
else if (length_ > x.length_) r = +1;
}
return r;
void set(const char* str, size_type len) {
data_ = str;
size_ = len;
}
std::string as_string() const {
return std::string(data(), size());
return std::string(data_, size_);
}
// We also define ToString() here, since many other string-like
// interfaces name the routine that converts to a C++ string
// "ToString", and it's confusing to have the method that does that
// for a StringPiece be called "as_string()". We also leave the
// "as_string()" method defined here for existing code.
std::string ToString() const {
return std::string(data(), size());
return std::string(data_, size_);
}
void CopyToString(std::string* target) const;
void AppendToString(std::string* target) const;
void CopyToString(std::string* target) const {
target->assign(data_, size_);
}
// Does "this" start with "x"
void AppendToString(std::string* target) const {
target->append(data_, size_);
}
size_type copy(char* buf, size_type n, size_type pos = 0) const;
StringPiece substr(size_type pos = 0, size_type n = npos) const;
int compare(const StringPiece& x) const {
size_type min_size = std::min(size(), x.size());
if (min_size > 0) {
int r = memcmp(data(), x.data(), min_size);
if (r < 0) return -1;
if (r > 0) return 1;
}
if (size() < x.size()) return -1;
if (size() > x.size()) return 1;
return 0;
}
// Does "this" start with "x"?
bool starts_with(const StringPiece& x) const {
return ((length_ >= x.length_) &&
(memcmp(ptr_, x.ptr_, x.length_) == 0));
return x.empty() ||
(size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
}
// Does "this" end with "x"
// Does "this" end with "x"?
bool ends_with(const StringPiece& x) const {
return ((length_ >= x.length_) &&
(memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
return x.empty() ||
(size() >= x.size() &&
memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
}
// standard STL container boilerplate
typedef char value_type;
typedef const char* pointer;
typedef const char& reference;
typedef const char& const_reference;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
static const size_type npos;
typedef const char* const_iterator;
typedef const char* iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
typedef std::reverse_iterator<iterator> reverse_iterator;
iterator begin() const { return ptr_; }
iterator end() const { return ptr_ + length_; }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(ptr_ + length_);
bool contains(const StringPiece& s) const {
return find(s) != npos;
}
const_reverse_iterator rend() const {
return const_reverse_iterator(ptr_);
}
// STLS says return size_type, but Google says return int
int max_size() const { return length_; }
int capacity() const { return length_; }
int copy(char* buf, size_type n, size_type pos = 0) const;
size_type find(const StringPiece& s, size_type pos = 0) const;
size_type find(char c, size_type pos = 0) const;
size_type rfind(const StringPiece& s, size_type pos = npos) const;
size_type rfind(char c, size_type pos = npos) const;
int find(const StringPiece& s, size_type pos = 0) const;
int find(char c, size_type pos = 0) const;
int rfind(const StringPiece& s, size_type pos = npos) const;
int rfind(char c, size_type pos = npos) const;
StringPiece substr(size_type pos, size_type n = npos) const;
static bool _equal(const StringPiece&, const StringPiece&);
private:
const_pointer data_;
size_type size_;
};
inline bool operator==(const StringPiece& x, const StringPiece& y) {
return StringPiece::_equal(x, y);
StringPiece::size_type len = x.size();
if (len != y.size()) return false;
return x.data() == y.data() || len == 0 ||
memcmp(x.data(), y.data(), len) == 0;
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
@ -157,9 +165,9 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
}
inline bool operator<(const StringPiece& x, const StringPiece& y) {
const int r = memcmp(x.data(), y.data(),
std::min(x.size(), y.size()));
return ((r < 0) || ((r == 0) && (x.size() < y.size())));
StringPiece::size_type min_size = std::min(x.size(), y.size());
int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
return (r < 0) || (r == 0 && x.size() < y.size());
}
inline bool operator>(const StringPiece& x, const StringPiece& y) {
@ -174,9 +182,9 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y);
}
// Allow StringPiece to be logged.
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
} // namespace re2
// allow StringPiece to be logged
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
#endif // STRINGS_STRINGPIECE_H__
#endif // RE2_STRINGPIECE_H_

View File

@ -5,7 +5,13 @@
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include <string.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
@ -42,7 +48,8 @@ class ToStringWalker : public Regexp::Walker<int> {
private:
string* t_; // The string the walker appends to.
DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
ToStringWalker(const ToStringWalker&) = delete;
ToStringWalker& operator=(const ToStringWalker&) = delete;
};
string Regexp::ToString() {
@ -94,6 +101,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
case kRegexpCapture:
t_->append("(");
if (re->cap() == 0)
LOG(DFATAL) << "kRegexpCapture cap() == 0";
if (re->name()) {
t_->append("?P<");
t_->append(*re->name());
@ -120,13 +129,12 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
static void AppendLiteral(string *t, Rune r, bool foldcase) {
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
t->append(1, '\\');
t->append(1, r);
t->append(1, static_cast<char>(r));
} else if (foldcase && 'a' <= r && r <= 'z') {
if ('a' <= r && r <= 'z')
r += 'A' - 'a';
r -= 'a' - 'A';
t->append(1, '[');
t->append(1, r);
t->append(1, r + 'a' - 'A');
t->append(1, static_cast<char>(r));
t->append(1, static_cast<char>(r) + 'a' - 'A');
t->append(1, ']');
} else {
AppendCCRange(t, r, r);
@ -154,12 +162,14 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
break;
case kRegexpLiteral:
AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
AppendLiteral(t_, re->rune(),
(re->parse_flags() & Regexp::FoldCase) != 0);
break;
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++)
AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
AppendLiteral(t_, re->runes()[i],
(re->parse_flags() & Regexp::FoldCase) != 0);
if (prec < PrecConcat)
t_->append(")");
break;
@ -297,7 +307,7 @@ static void AppendCCChar(string* t, Rune r) {
if (0x20 <= r && r <= 0x7E) {
if (strchr("[]^-\\", r))
t->append("\\");
t->append(1, r);
t->append(1, static_cast<char>(r));
return;
}
switch (r) {

View File

@ -9,7 +9,7 @@ import re
import urllib2
# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"
_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF

View File

@ -7,7 +7,7 @@
namespace re2 {
// 1034 groups, 2089 pairs, 289 ranges
// 1295 groups, 2620 pairs, 343 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
@ -105,13 +105,17 @@ const CaseFold unicode_casefold[] = {
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 604, 604, 42319 },
{ 608, 608, -205 },
{ 609, 609, 42315 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 618, 618, 42308 },
{ 619, 619, 10743 },
{ 620, 620, 42305 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
@ -119,15 +123,19 @@ const CaseFold unicode_casefold[] = {
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 643, 643, -218 },
{ 647, 647, 42282 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 669, 669, 42261 },
{ 670, 670, 42258 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
@ -168,6 +176,7 @@ const CaseFold unicode_casefold[] = {
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1011, 1011, -116 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
@ -176,19 +185,43 @@ const CaseFold unicode_casefold[] = {
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1103, -32 },
{ 1072, 1073, -32 },
{ 1074, 1074, 6222 },
{ 1075, 1075, -32 },
{ 1076, 1076, 6221 },
{ 1077, 1085, -32 },
{ 1086, 1086, 6212 },
{ 1087, 1088, -32 },
{ 1089, 1090, 6210 },
{ 1091, 1097, -32 },
{ 1098, 1098, 6204 },
{ 1099, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1153, EvenOdd },
{ 1120, 1122, EvenOdd },
{ 1123, 1123, 6180 },
{ 1124, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1319, EvenOdd },
{ 1232, 1327, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5024, 5103, 38864 },
{ 5104, 5109, 8 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6254 },
{ 7297, 7297, -6253 },
{ 7298, 7298, -6244 },
{ 7299, 7299, -6242 },
{ 7300, 7300, EvenOdd },
{ 7301, 7301, -6243 },
{ 7302, 7302, -6236 },
{ 7303, 7303, -6181 },
{ 7304, 7304, 35266 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7680, 7776, EvenOdd },
@ -282,8 +315,10 @@ const CaseFold unicode_casefold[] = {
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
{ 42560, 42605, EvenOdd },
{ 42624, 42647, EvenOdd },
{ 42560, 42570, EvenOdd },
{ 42571, 42571, -35267 },
{ 42572, 42605, EvenOdd },
{ 42624, 42651, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
@ -292,16 +327,35 @@ const CaseFold unicode_casefold[] = {
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42899, EvenOdd },
{ 42912, 42921, EvenOdd },
{ 42902, 42921, EvenOdd },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42935, EvenOdd },
{ 43859, 43859, -928 },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
{ 66736, 66771, 40 },
{ 66776, 66811, -40 },
{ 68736, 68786, 64 },
{ 68800, 68850, -64 },
{ 71840, 71871, 32 },
{ 71872, 71903, -32 },
{ 125184, 125217, 34 },
{ 125218, 125251, -34 },
};
const int num_unicode_casefold = 289;
const int num_unicode_casefold = 343;
// 1034 groups, 1055 pairs, 167 ranges
// 1295 groups, 1325 pairs, 191 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
@ -370,6 +424,7 @@ const CaseFold unicode_tolower[] = {
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
@ -397,11 +452,20 @@ const CaseFold unicode_tolower[] = {
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1318, EvenOddSkip },
{ 1232, 1326, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6222 },
{ 7297, 7297, -6221 },
{ 7298, 7298, -6212 },
{ 7299, 7300, -6210 },
{ 7301, 7301, -6211 },
{ 7302, 7302, -6204 },
{ 7303, 7303, -6180 },
{ 7304, 7304, 35267 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
@ -457,7 +521,7 @@ const CaseFold unicode_tolower[] = {
{ 11499, 11501, OddEvenSkip },
{ 11506, 11506, EvenOdd },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42646, EvenOddSkip },
{ 42624, 42650, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
@ -466,12 +530,26 @@ const CaseFold unicode_tolower[] = {
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42898, EvenOddSkip },
{ 42912, 42920, EvenOddSkip },
{ 42902, 42920, EvenOddSkip },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42934, EvenOddSkip },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
{ 66736, 66771, 40 },
{ 68736, 68786, 64 },
{ 71840, 71871, 32 },
{ 125184, 125217, 34 },
};
const int num_unicode_tolower = 167;
const int num_unicode_tolower = 191;

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UNICODE_CASEFOLD_H_
#define RE2_UNICODE_CASEFOLD_H_
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
@ -16,7 +19,7 @@
// '' -> 'K'
//
// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
@ -36,10 +39,10 @@
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
#ifndef RE2_UNICODE_CASEFOLD_H__
#define RE2_UNICODE_CASEFOLD_H__
#include <stdint.h>
#include "util/util.h"
#include "util/utf.h"
namespace re2 {
@ -51,9 +54,9 @@ enum {
};
struct CaseFold {
uint32 lo;
uint32 hi;
int32 delta;
Rune lo;
Rune hi;
int32_t delta;
};
extern const CaseFold unicode_casefold[];
@ -72,4 +75,4 @@ extern Rune ApplyFold(const CaseFold *f, Rune r);
} // namespace re2
#endif // RE2_UNICODE_CASEFOLD_H__
#endif // RE2_UNICODE_CASEFOLD_H_

File diff suppressed because it is too large Load Diff

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UNICODE_GROUPS_H_
#define RE2_UNICODE_GROUPS_H_
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
@ -15,23 +18,23 @@
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
#ifndef RE2_UNICODE_GROUPS_H__
#define RE2_UNICODE_GROUPS_H__
#include <stdint.h>
#include "util/util.h"
#include "util/utf.h"
namespace re2 {
struct URange16
{
uint16 lo;
uint16 hi;
uint16_t lo;
uint16_t hi;
};
struct URange32
{
uint32 lo;
uint32 hi;
Rune lo;
Rune hi;
};
struct UGroup
@ -61,4 +64,4 @@ extern const int num_perl_groups;
} // namespace re2
#endif // RE2_UNICODE_GROUPS_H__
#endif // RE2_UNICODE_GROUPS_H_

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_WALKER_INL_H_
#define RE2_WALKER_INL_H_
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
@ -10,9 +13,9 @@
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
#ifndef RE2_WALKER_INL_H__
#define RE2_WALKER_INL_H__
#include <stack>
#include "util/logging.h"
#include "re2/regexp.h"
namespace re2 {
@ -86,13 +89,14 @@ template<typename T> class Regexp::Walker {
private:
// Walk state for the entire traversal.
stack<WalkState<T> >* stack_;
std::stack<WalkState<T> >* stack_;
bool stopped_early_;
int max_visits_;
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
DISALLOW_EVIL_CONSTRUCTORS(Walker);
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
};
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
@ -130,7 +134,7 @@ template<typename T> struct WalkState {
};
template<typename T> Regexp::Walker<T>::Walker() {
stack_ = new stack<WalkState<T> >;
stack_ = new std::stack<WalkState<T> >;
stopped_early_ = false;
}
@ -187,7 +191,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
// Fall through.
FALLTHROUGH_INTENDED;
}
default: {
if (re->nsub_ > 0) {
@ -241,4 +245,4 @@ template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
} // namespace re2
#endif // RE2_WALKER_INL_H__
#endif // RE2_WALKER_INL_H_

View File

@ -1,5 +1,6 @@
# Load the re2 source file named by SOURCE_FILENAME so that it can be
# rewritten to live in the re2_st namespace.
file (READ ${SOURCE_FILENAME} CONTENT)
# Drop the using-declarations that pull re2 names into the enclosing scope.
string (REGEX REPLACE "using re2::RE2;" "" CONTENT "${CONTENT}")
string (REGEX REPLACE "using re2::LazyRE2;" "" CONTENT "${CONTENT}")
# Rename the namespace, then every qualified reference to it.
string (REGEX REPLACE "namespace re2" "namespace re2_st" CONTENT "${CONTENT}")
string (REGEX REPLACE "re2::" "re2_st::" CONTENT "${CONTENT}")
# Redirect quoted "re2/..." paths to the re2_st directory.
string (REGEX REPLACE "\"re2/" "\"re2_st/" CONTENT "${CONTENT}")

View File

@ -1,168 +0,0 @@
// Copyright 2000 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
namespace re2 {
// ----------------------------------------------------------------------
// UnsafeArena::UnsafeArena()
// UnsafeArena::~UnsafeArena()
// Destroying the arena automatically calls Reset()
// ----------------------------------------------------------------------
// Creates an arena that hands out memory from blocks of |block_size| bytes.
// The first block is allocated right away; Reset() then establishes the
// allocation cursor inside it.
UnsafeArena::UnsafeArena(const size_t block_size)
    : block_size_(block_size),
      freestart_(NULL),  // real value is assigned by Reset() below
      last_alloc_(NULL),
      remaining_(0),
      blocks_alloced_(1),
      overflow_blocks_(NULL) {
  assert(block_size > kDefaultAlignment);

  AllocatedBlock& first = first_blocks_[0];
  first.mem = reinterpret_cast<char*>(malloc(block_size_));
  first.size = block_size_;

  Reset();
}
// Releases every block owned by the arena, including the built-in blocks
// that FreeBlocks() deliberately keeps alive.
UnsafeArena::~UnsafeArena() {
  FreeBlocks();
  assert(overflow_blocks_ == NULL);  // FreeBlocks() is expected to clear it

  // Whatever is still counted in blocks_alloced_ is freed here.
  AllocatedBlock* const end = first_blocks_ + blocks_alloced_;
  for (AllocatedBlock* block = first_blocks_; block < end; ++block)
    free(block->mem);
}
// ----------------------------------------------------------------------
// UnsafeArena::Reset()
// Clears all the memory an arena is using.
// ----------------------------------------------------------------------
// Discards all allocations: frees the extra blocks and rewinds the cursor to
// the (aligned) start of the first block.
void UnsafeArena::Reset() {
  FreeBlocks();

  last_alloc_ = NULL;
  freestart_ = first_blocks_[0].mem;
  remaining_ = first_blocks_[0].size;

  // The first block is not known to be aligned, so skip forward to the next
  // kDefaultAlignment boundary if necessary.
  const int misalignment = reinterpret_cast<uintptr_t>(freestart_) &
                           (kDefaultAlignment-1);
  if (misalignment > 0) {
    const int padding = kDefaultAlignment - misalignment;
    freestart_ += padding;
    remaining_ -= padding;
  }

  freestart_when_empty_ = freestart_;
  assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1)));
}
// -------------------------------------------------------------
// UnsafeArena::AllocNewBlock()
// Adds and returns an AllocatedBlock.
// The returned AllocatedBlock* is valid until the next call
// to AllocNewBlock or Reset. (i.e. anything that might
// affect overflow_blocks_).
// -------------------------------------------------------------
// Allocates a block of |block_size| bytes and records it in the next free
// bookkeeping slot: one of the built-in first_blocks_ entries while any
// remain, otherwise a new element of the lazily created overflow vector.
UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
  AllocatedBlock* slot = NULL;

  const bool builtin_slot_free =
      static_cast<size_t>(blocks_alloced_) < arraysize(first_blocks_);
  if (!builtin_slot_free) {
    // Out of built-in slots: grow the overflow vector by one element and
    // use that new last element.
    if (overflow_blocks_ == NULL)
      overflow_blocks_ = new vector<AllocatedBlock>;
    overflow_blocks_->resize(overflow_blocks_->size()+1);
    slot = &overflow_blocks_->back();
  } else {
    slot = &first_blocks_[blocks_alloced_++];
  }

  slot->mem = reinterpret_cast<char*>(malloc(block_size));
  slot->size = block_size;
  return slot;
}
// ----------------------------------------------------------------------
// UnsafeArena::GetMemoryFallback()
// We take memory out of our pool, aligned on the byte boundary
// requested. If we don't have space in our current pool, we
// allocate a new block (wasting the remaining space in the
// current block) and give you that. If your memory needs are
// too big for a single block, we make a special your-memory-only
// allocation -- this is equivalent to not using the arena at all.
// ----------------------------------------------------------------------
// Slow path of GetMemory(): returns `size` bytes aligned to `align`.
//
//   size   number of bytes requested; 0 yields NULL.
//   align  required alignment in bytes; must be a power of two.
//
// Requests larger than a quarter of block_size_ (or any request when
// block_size_ is 0) get a dedicated block.  Everything else is carved out of
// the current block, starting a fresh block when the current one cannot hold
// the request after alignment padding.
void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
  if (size == 0)
    return NULL;             // stl/stl_alloc.h says this is okay

  assert(align > 0 && 0 == (align & (align - 1)));  // must be power of 2

  // If the object is more than a quarter of the block size, allocate
  // it separately to avoid wasting too much space in leftover bytes.
  if (block_size_ == 0 || size > block_size_/4) {
    // The dedicated block comes straight from AllocNewBlock(), so its
    // alignment is whatever the underlying allocator provides.
    assert(align <= kDefaultAlignment);
    // This block stays separate from the rest of the world; in particular
    // we don't update last_alloc_ so you can't reclaim space on this block.
    return AllocNewBlock(size)->mem;
  }

  const int overage =
      (reinterpret_cast<uintptr_t>(freestart_) & (align-1));
  if (overage) {
    const int waste = align - overage;
    // Compare in size_t so that blocks larger than INT_MAX are not
    // misjudged, and only advance freestart_ when the padding really fits:
    // moving it beyond the end of the block would be undefined behaviour.
    if (static_cast<size_t>(waste) < remaining_) {
      freestart_ += waste;
      remaining_ -= waste;
    } else {
      // No room even for the padding.  size > 0 here, so the check below
      // is guaranteed to switch to a fresh block and reset freestart_.
      remaining_ = 0;
    }
  }

  if (size > remaining_) {
    // Abandon whatever is left in the current block and start a new one.
    AllocatedBlock *block = AllocNewBlock(block_size_);
    freestart_ = block->mem;
    remaining_ = block->size;
  }

  remaining_ -= size;
  last_alloc_ = freestart_;
  freestart_ += size;
  assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
  return reinterpret_cast<void*>(last_alloc_);
}
// ----------------------------------------------------------------------
// UnsafeArena::FreeBlocks()
// Unlike GetMemory(), which does actual work, ReturnMemory() is a
// no-op: we don't "free" memory until Reset() is called. We do
// update some stats, though. Note we do no checking that the
// pointer you pass in was actually allocated by us, or that it
// was allocated for the size you say, so be careful here!
// FreeBlocks() does the work for Reset(), actually freeing all
// memory allocated in one fell swoop.
// ----------------------------------------------------------------------
void UnsafeArena::FreeBlocks() {
for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced
free(first_blocks_[i].mem);
first_blocks_[i].mem = NULL;
first_blocks_[i].size = 0;
}
blocks_alloced_ = 1;
if (overflow_blocks_ != NULL) {
vector<AllocatedBlock>::iterator it;
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
free(it->mem);
}
delete overflow_blocks_; // These should be used very rarely
overflow_blocks_ = NULL;
}
}
} // namespace re2

View File

@ -1,103 +0,0 @@
// Copyright 2000 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Sometimes it is necessary to allocate a large number of small
// objects. Doing this the usual way (malloc, new) is slow,
// especially for multithreaded programs. An UnsafeArena provides a
// mark/release method of memory management: it asks for a large chunk
// from the operating system and doles it out bit by bit as required.
// Then you free all the memory at once by calling UnsafeArena::Reset().
// The "Unsafe" refers to the fact that UnsafeArena is not safe to
// call from multiple threads.
//
// The global placement operator new declared below can be used as follows:
//
// #include "lib/arena-inl.h"
//
// UnsafeArena arena(1000);
// Foo* foo = new (AllocateInArena, &arena) Foo;
//
#ifndef RE2_UTIL_ARENA_H_
#define RE2_UTIL_ARENA_H_
namespace re2 {
// This class is thread-compatible.
// Arena allocator: hands out memory carved from malloc'ed blocks and only
// gives it back in bulk (see FreeBlocks(), which does the work for Reset()).
class UnsafeArena {
public:
// block_size is the size requested for each regular block of the arena.
UnsafeArena(const size_t block_size);
virtual ~UnsafeArena();
// Frees all the memory handed out so far at once.
void Reset();
// This should be the worst-case alignment for any type. This is
// good for IA-32, SPARC version 7 (the last one I know), and
// supposedly Alpha. i386 would be more time-efficient with a
// default alignment of 8, but ::operator new() uses alignment of 4,
// and an assertion in GetMemoryFallback() will fail if you try to use
// a larger alignment for a request that gets a block of its own.
#ifdef __i386__
static const int kDefaultAlignment = 4;
#else
static const int kDefaultAlignment = 8;
#endif
private:
// Slow path of GetMemory(): handles alignment padding, starting a new
// block, and oversized requests.
void* GetMemoryFallback(const size_t size, const int align);
public:
// Returns `size` bytes aligned to `align`.  Only requests with align == 1
// that fit strictly inside the remaining space are served inline; every
// other request (including size == 0) goes to GetMemoryFallback().
void* GetMemory(const size_t size, const int align) {
if ( size > 0 && size < remaining_ && align == 1 ) { // common case
last_alloc_ = freestart_;
freestart_ += size;
remaining_ -= size;
return reinterpret_cast<void*>(last_alloc_);
}
return GetMemoryFallback(size, align);
}
private:
// Bookkeeping record for one malloc'ed block.
struct AllocatedBlock {
char *mem;
size_t size;
};
// The returned AllocatedBlock* is valid until the next call to AllocNewBlock
// or Reset (i.e. anything that might affect overflow_blocks_).
AllocatedBlock *AllocNewBlock(const size_t block_size);
// NOTE(review): no definition is visible here; presumably maps an index
// across first_blocks_ and overflow_blocks_ -- confirm.
const AllocatedBlock *IndexToBlock(int index) const;
const size_t block_size_;
char* freestart_; // beginning of the free space in most recent block
char* freestart_when_empty_; // beginning of the free space when we're empty
char* last_alloc_; // used to make sure ReturnBytes() is safe
size_t remaining_; // bytes still available starting at freestart_
// STL vector isn't as efficient as it could be, so we use an array at first
int blocks_alloced_; // how many of the first_blocks_ have been alloced
AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
// NULL until the first overflow happens; owned by the arena.
vector<AllocatedBlock>* overflow_blocks_;
void FreeBlocks(); // Frees all except first block
DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
};
// Operators for allocation on the arena
// Syntax: new (AllocateInArena, arena) MyClass;
// STL containers, etc.
enum AllocateInArenaType { AllocateInArena };
} // namespace re2
// Placement form used as `new (AllocateInArena, &arena) T`: obtains storage
// for the object from `arena` instead of the global heap.
//
// The storage is requested with UnsafeArena::kDefaultAlignment rather than
// an alignment of 1: constructing an object at an address that is not
// suitably aligned for its type is undefined behaviour, and a byte-aligned
// request can land on any address once differently sized objects share a
// block.  Types needing more than kDefaultAlignment are still unsupported.
inline void* operator new(size_t size,
                          re2::AllocateInArenaType /* unused */,
                          re2::UnsafeArena *arena) {
  return arena->GetMemory(size, re2::UnsafeArena::kDefaultAlignment);
}
#endif // RE2_UTIL_ARENA_H_

View File

@ -1,137 +0,0 @@
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_ATOMICOPS_H__
#define RE2_UTIL_ATOMICOPS_H__
// The memory ordering constraints resemble the ones in C11.
// RELAXED - no memory ordering, just an atomic operation.
// CONSUME - data-dependent ordering.
// ACQUIRE - prevents memory accesses from hoisting above the operation.
// RELEASE - prevents memory accesses from sinking below the operation.
// Use the compiler's __atomic builtins on Clang >= 3.3 and GCC >= 4.8.1.
// The GCC version is encoded as major*10000 + minor*100 + patchlevel so that
// it is comparable with 40801; scaling the major version by only 1000 could
// never reach that threshold and silently selected the fallback below.
#if (__clang_major__ * 100 + __clang_minor__ >= 303) || \
    (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ >= 40801)

#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0)
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0)
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0)
#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE)

#else  // old compiler

// Plain loads and stores combined with the hand-written barriers below.
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0)
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0)
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0)
#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0)
#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0)

// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier()
// are an implementation detail and must not be used in the rest of the code.

#if defined(__i386__)

static inline void WriteMemoryBarrier() {
  int x = 0;
  int* p = &x;
  // xchg overwrites its register operand, so the operand has to be declared
  // read-write ("+r"); the "memory" clobber keeps the compiler from moving
  // memory accesses across the barrier.
  __asm__ __volatile__("xchgl (%0),%0"  // The lock prefix is implicit for xchg.
                       : "+r" (p)
                       :
                       : "memory");
}

#elif defined(__x86_64__)

// 64-bit implementations of memory barrier can be simpler, because
// "sfence" is guaranteed to exist.
static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("sfence" : : : "memory");
}

#elif defined(__ppc__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("eieio" : : : "memory");
}

#elif defined(__alpha__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("wmb" : : : "memory");
}

#elif defined(__aarch64__)

static inline void WriteMemoryBarrier() {
  __asm__ __volatile__("dmb st" : : : "memory");
}

#else

#include "util/mutex.h"

static inline void WriteMemoryBarrier() {
  // Slight overkill, but good enough:
  // any mutex implementation must have
  // a read barrier after the lock operation and
  // a write barrier before the unlock operation.
  //
  // It may be worthwhile to write architecture-specific
  // barriers for the common platforms, as above, but
  // this is a correct fallback.
  re2::Mutex mu;
  re2::MutexLock l(&mu);
}

/*
#error Need WriteMemoryBarrier for architecture.
// Windows
inline void WriteMemoryBarrier() {
LONG x;
::InterlockedExchange(&x, 0);
}
*/

#endif

// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
// use read barriers for the readers too.
#if defined(__alpha__)

static inline void MaybeReadMemoryBarrier() {
  __asm__ __volatile__("mb" : : : "memory");
}

#else

static inline void MaybeReadMemoryBarrier() {}

#endif  // __alpha__

// Read barrier for various targets.
#if defined(__aarch64__)

static inline void ReadMemoryBarrier() {
  __asm__ __volatile__("dmb ld" : : : "memory");
}

#elif defined(__alpha__)

static inline void ReadMemoryBarrier() {
  __asm__ __volatile__("mb" : : : "memory");
}

#else

static inline void ReadMemoryBarrier() {}

#endif

#endif  // old compiler
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
#endif // RE2_UTIL_ATOMICOPS_H__

View File

@ -2,6 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <chrono>
#include "util/util.h"
#include "util/flags.h"
#include "util/benchmark.h"
@ -9,8 +15,11 @@
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
#ifdef _WIN32
#define snprintf _snprintf
#endif
using testing::Benchmark;
using namespace re2;
static Benchmark* benchmarks[10000];
static int nbenchmarks;
@ -24,19 +33,17 @@ void Benchmark::Register() {
nbenchmarks++;
}
static int64 nsec() {
struct timeval tv;
if(gettimeofday(&tv, 0) < 0)
return -1;
return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000;
static int64_t nsec() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch()).count();
}
static int64 bytes;
static int64 ns;
static int64 t0;
static int64 items;
static int64_t bytes;
static int64_t ns;
static int64_t t0;
static int64_t items;
void SetBenchmarkBytesProcessed(long long x) {
void SetBenchmarkBytesProcessed(int64_t x) {
bytes = x;
}
@ -74,7 +81,7 @@ static void runN(Benchmark *b, int n, int siz) {
b->fnr(n, siz);
else {
fprintf(stderr, "%s: missing function\n", b->name);
exit(2);
abort();
}
if(t0 != 0)
ns += nsec() - t0;
@ -105,11 +112,11 @@ void RunBench(Benchmark* b, int nthread, int siz) {
while(ns < (int)1e9 && n < (int)1e9) {
last = n;
if(ns/n == 0)
n = 1e9;
n = (int)1e9;
else
n = 1e9 / (ns/n);
n = (int)1e9 / static_cast<int>(ns/n);
n = max(last+1, min(n+n/2, 100*last));
n = std::max(last+1, std::min(n+n/2, 100*last));
n = round(n);
runN(b, n, siz);
}
@ -146,7 +153,7 @@ int main(int argc, const char** argv) {
Benchmark* b = benchmarks[i];
if(match(b->name, argc, argv))
for(int j = b->threadlo; j <= b->threadhi; j++)
for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1)
for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1)
RunBench(b, j, k);
}
}

View File

@ -2,8 +2,10 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_BENCHMARK_H__
#define RE2_UTIL_BENCHMARK_H__
#ifndef UTIL_BENCHMARK_H_
#define UTIL_BENCHMARK_H_
#include <stdint.h>
namespace testing {
struct Benchmark {
@ -23,7 +25,7 @@ struct Benchmark {
};
} // namespace testing
void SetBenchmarkBytesProcessed(long long);
void SetBenchmarkBytesProcessed(int64_t);
void StopBenchmarkTiming();
void StartBenchmarkTiming();
void BenchmarkMemoryUsage();
@ -38,4 +40,4 @@ int NumCPUs();
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#endif // RE2_UTIL_BENCHMARK_H__
#endif // UTIL_BENCHMARK_H_

View File

@ -2,13 +2,15 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_FLAGS_H_
#define UTIL_FLAGS_H_
// Simplified version of Google's command line flags.
// Does not support parsing the command line.
// If you want to do that, see
// http://code.google.com/p/google-gflags
// https://gflags.github.io/gflags/
#ifndef RE2_UTIL_FLAGS_H__
#define RE2_UTIL_FLAGS_H__
#include <stdint.h>
#define DEFINE_flag(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
@ -17,11 +19,11 @@
namespace re2 { extern type FLAGS_##name; }
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc)
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
#define DECLARE_bool(name) DECLARE_flag(bool, name)
#define DECLARE_int32(name) DECLARE_flag(int32, name)
#define DECLARE_int32(name) DECLARE_flag(int32_t, name)
#define DECLARE_string(name) DECLARE_flag(string, name)
#endif // RE2_UTIL_FLAGS_H__
#endif // UTIL_FLAGS_H_

View File

@ -0,0 +1,21 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
// Entry point for libFuzzer.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
// Stand-alone driver for builds without libFuzzer: calls the fuzz target a
// fixed number of times, each time with a freshly filled 32-byte buffer of
// bytes taken from random().
int main(int argc, char** argv) {
  const int kRounds = 32;
  const size_t kInputSize = 32;
  uint8_t data[kInputSize];
  for (int round = 0; round < kRounds; ++round) {
    // Refill the whole buffer before every call.
    for (size_t pos = 0; pos < kInputSize; ++pos)
      data[pos] = random() & 0xFF;
    LLVMFuzzerTestOneInput(data, kInputSize);
  }
  return 0;
}

View File

@ -1,231 +0,0 @@
// Modified by Russ Cox to add "namespace re2".
// Also threw away all but hashword and hashword2.
// http://burtleburtle.net/bob/c/lookup3.c
/*
-------------------------------------------------------------------------------
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
These are functions for producing 32-bit hashes for hash table lookup.
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
are externally useful functions. Routines to test the hash are included
if SELF_TEST is defined. You can use this free for any purpose. It's in
the public domain. It has no warranty.
You probably want to use hashlittle(). hashlittle() and hashbig()
hash byte arrays. hashlittle() is faster than hashbig() on
little-endian machines. Intel and AMD are little-endian machines.
On second thought, you probably want hashlittle2(), which is identical to
hashlittle() except it returns two 32-bit hashes for the price of one.
You could implement hashbig2() if you wanted but I haven't bothered here.
If you want to find a hash of, say, exactly 7 integers, do
a = i1; b = i2; c = i3;
mix(a,b,c);
a += i4; b += i5; c += i6;
mix(a,b,c);
a += i7;
final(a,b,c);
then use c as the hash value. If you have a variable length array of
4-byte integers to hash, use hashword(). If you have a byte array (like
a character string), use hashlittle(). If you have several byte arrays, or
a mix of things, see the comments above hashlittle().
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
then mix those integers. This is fast (you can do a lot more thorough
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
-------------------------------------------------------------------------------
*/
#include "util/util.h"
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
/*
-------------------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.
This is reversible, so any information in (a,b,c) before mix() is
still in (a,b,c) after mix().
If four pairs of (a,b,c) inputs are run through mix(), or through
mix() in reverse, there are at least 32 bits of the output that
are sometimes the same for one pair and different for another pair.
This was tested for:
* pairs that differed by one bit, by two bits, in any combination
of top bits of (a,b,c), or in any combination of bottom bits of
(a,b,c).
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
is commonly produced by subtraction) look like a single 1-bit
difference.
* the base values were pseudorandom, all zero but one bit set, or
all zero plus a counter that starts at zero.
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
satisfy this are
4 6 8 16 19 4
9 15 3 18 27 15
14 9 3 7 17 3
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
for "differ" defined as + with a one-bit base and a two-bit delta. I
used http://burtleburtle.net/bob/hash/avalanche.html to choose
the operations, constants, and arrangements of the variables.
This does not achieve avalanche. There are input bits of (a,b,c)
that fail to affect some output bits of (a,b,c), especially of a. The
most thoroughly mixed value is c, but it doesn't really even achieve
avalanche in c.
This allows some parallelism. Read-after-writes are good at doubling
the number of bits affected, so the goal of mixing pulls in the opposite
direction as the goal of parallelism. I did what I could. Rotates
seem to cost as much as shifts on every machine I could lay my hands
on, and rotates are much kinder to the top and bottom bits, so I used
rotates.
-------------------------------------------------------------------------------
*/
#define mix(a,b,c) \
{ \
a -= c; a ^= rot(c, 4); c += b; \
b -= a; b ^= rot(a, 6); a += c; \
c -= b; c ^= rot(b, 8); b += a; \
a -= c; a ^= rot(c,16); c += b; \
b -= a; b ^= rot(a,19); a += c; \
c -= b; c ^= rot(b, 4); b += a; \
}
/*
-------------------------------------------------------------------------------
final -- final mixing of 3 32-bit values (a,b,c) into c
Pairs of (a,b,c) values differing in only a few bits will usually
produce values of c that look totally different. This was tested for
* pairs that differed by one bit, by two bits, in any combination
of top bits of (a,b,c), or in any combination of bottom bits of
(a,b,c).
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
is commonly produced by subtraction) look like a single 1-bit
difference.
* the base values were pseudorandom, all zero but one bit set, or
all zero plus a counter that starts at zero.
These constants passed:
14 11 25 16 4 14 24
12 14 25 16 4 14 24
and these came close:
4 8 15 26 3 22 24
10 8 15 26 3 22 24
11 8 15 26 3 22 24
-------------------------------------------------------------------------------
*/
#define final(a,b,c) \
{ \
c ^= b; c -= rot(b,14); \
a ^= c; a -= rot(c,11); \
b ^= a; b -= rot(a,25); \
c ^= b; c -= rot(b,16); \
a ^= c; a -= rot(c,4); \
b ^= a; b -= rot(a,14); \
c ^= b; c -= rot(b,24); \
}
namespace re2 {
/*
--------------------------------------------------------------------
This works on all machines. To be useful, it requires
-- that the key be an array of uint32_t's, and
-- that the length be the number of uint32_t's in the key
The function hashword() is identical to hashlittle() on little-endian
machines, and identical to hashbig() on big-endian machines,
except that the length has to be measured in uint32_ts rather than in
bytes. hashlittle() is more complicated than hashword() only because
hashlittle() has to dance around fitting the key bytes into registers.
--------------------------------------------------------------------
*/
uint32 hashword(
    const uint32 *k,   /* the key, an array of uint32_t values */
    size_t length,     /* the length of the key, in uint32_ts */
    uint32 initval)    /* the previous hash, or an arbitrary value */
{
  /* Set up the internal state: all three lanes start from the same seed. */
  const uint32_t seed = 0xdeadbeef + (((uint32_t)length) << 2) + initval;
  uint32_t a = seed;
  uint32_t b = seed;
  uint32_t c = seed;

  /* Handle most of the key, three words per round.  The loop stops while
     up to three words are still pending so that the tail goes through
     final() rather than mix(). */
  for (; length > 3; length -= 3, k += 3) {
    a += k[0];
    b += k[1];
    c += k[2];
    mix(a, b, c);
  }

  /* Handle the last 1..3 words; an empty tail skips final() altogether. */
  if (length != 0) {
    if (length == 3) c += k[2];
    if (length >= 2) b += k[1];
    a += k[0];
    final(a, b, c);
  }

  /* Report the result. */
  return c;
}
/*
--------------------------------------------------------------------
hashword2() -- same as hashword(), but take two seeds and return two
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
both be initialized with seeds. If you pass in (*pb)==0, the output
(*pc) will be the same as the return value from hashword().
--------------------------------------------------------------------
*/
void hashword2 (
    const uint32 *k,   /* the key, an array of uint32_t values */
    size_t length,     /* the length of the key, in uint32_ts */
    uint32 *pc,        /* IN: seed OUT: primary hash value */
    uint32 *pb)        /* IN: more seed OUT: secondary hash value */
{
  /* Set up the internal state: a and b take the first seed, c additionally
     absorbs the second one. */
  const uint32_t seed = 0xdeadbeef + ((uint32_t)(length << 2)) + *pc;
  uint32_t a = seed;
  uint32_t b = seed;
  uint32_t c = seed + *pb;

  /* Handle most of the key, three words per round.  The loop stops while
     up to three words are still pending so that the tail goes through
     final() rather than mix(). */
  for (; length > 3; length -= 3, k += 3) {
    a += k[0];
    b += k[1];
    c += k[2];
    mix(a, b, c);
  }

  /* Handle the last 1..3 words; an empty tail skips final() altogether. */
  if (length != 0) {
    if (length == 3) c += k[2];
    if (length >= 2) b += k[1];
    a += k[0];
    final(a, b, c);
  }

  /* Report the result. */
  *pc = c;
  *pb = b;
}
} // namespace re2

View File

@ -2,14 +2,19 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_LOGGING_H_
#define UTIL_LOGGING_H_
// Simplified version of Google's logging.
#ifndef RE2_UTIL_LOGGING_H__
#define RE2_UTIL_LOGGING_H__
#include <unistd.h> /* for write */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <ostream>
#include <sstream>
#include "util/util.h"
// Debug-only checking.
#define DCHECK(condition) assert(condition)
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
@ -29,33 +34,37 @@
#define CHECK_NE(x, y) CHECK((x) != (y))
#define LOG_INFO LogMessage(__FILE__, __LINE__)
#define LOG_ERROR LOG_INFO
#define LOG_WARNING LOG_INFO
#define LOG_WARNING LogMessage(__FILE__, __LINE__)
#define LOG_ERROR LogMessage(__FILE__, __LINE__)
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
#define LOG_QFATAL LOG_FATAL
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
// It seems that one of the Windows header files defines ERROR as 0.
#ifdef _WIN32
#define LOG_0 LOG_INFO
#endif
#ifdef NDEBUG
#define DEBUG_MODE 0
#define LOG_DFATAL LOG_ERROR
#else
#define DEBUG_MODE 1
#define LOG_DFATAL LOG_FATAL
#endif
#define LOG(severity) LOG_ ## severity.stream()
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
class LogMessage {
public:
LogMessage(const char* file, int line) : flushed_(false) {
LogMessage(const char* file, int line)
: flushed_(false) {
stream() << file << ":" << line << ": ";
}
void Flush() {
stream() << "\n";
string s = str_.str();
int n = (int)s.size(); // shut up msvc
if(write(2, s.data(), n) < 0) {} // shut up gcc
size_t n = s.size();
if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc
flushed_ = true;
}
~LogMessage() {
@ -63,14 +72,23 @@ class LogMessage {
Flush();
}
}
ostream& stream() { return str_; }
std::ostream& stream() { return str_; }
private:
bool flushed_;
std::ostringstream str_;
DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
LogMessage(const LogMessage&) = delete;
LogMessage& operator=(const LogMessage&) = delete;
};
// Silence "destructor never returns" warning for ~LogMessageFatal().
// Since this is a header file, push and then pop to limit the scope.
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4722)
#endif
class LogMessageFatal : public LogMessage {
public:
LogMessageFatal(const char* file, int line)
@ -80,7 +98,12 @@ class LogMessageFatal : public LogMessage {
abort();
}
private:
DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
LogMessageFatal(const LogMessageFatal&) = delete;
LogMessageFatal& operator=(const LogMessageFatal&) = delete;
};
#endif // RE2_UTIL_LOGGING_H__
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#endif // UTIL_LOGGING_H_

41
contrib/libre2/util/mix.h Normal file
View File

@ -0,0 +1,41 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MIX_H_
#define UTIL_MIX_H_
#include <stddef.h>
#include <limits>
namespace re2 {
// Silence "truncation of constant value" warning for kMul in 32-bit mode.
// Since this is a header file, push and then pop to limit the scope.
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4309)
#endif
// Incremental hash combiner.  Feed values one at a time through Mix() and
// read the accumulated hash with get().
class HashMix {
 public:
  // Starts from the fixed state 1.
  HashMix() : hash_(1) {}

  // Starts from a state derived from `val`.
  explicit HashMix(size_t val) : hash_(val + 83) {}

  // Folds `val` into the state: multiply by a fixed constant, rotate the
  // product left by 19 bits, then add `val`.
  void Mix(size_t val) {
    static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
    const int kWordBits = std::numeric_limits<size_t>::digits;
    const size_t product = hash_ * kMul;
    const size_t rotated = (product << 19) | (product >> (kWordBits - 19));
    hash_ = rotated + val;
  }

  // Returns the hash accumulated so far.
  size_t get() const { return hash_; }

 private:
  size_t hash_;
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
} // namespace re2
#endif // UTIL_MIX_H_

View File

@ -2,64 +2,41 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MUTEX_H_
#define UTIL_MUTEX_H_
/*
* A simple mutex wrapper, supporting locks and read-write locks.
* You should assume the locks are *not* re-entrant.
*/
#ifndef RE2_UTIL_MUTEX_H_
#define RE2_UTIL_MUTEX_H_
#if !defined(_WIN32)
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif
#include <unistd.h>
#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
#define MUTEX_IS_PTHREAD_RWLOCK
#endif
#endif
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
#include <pthread.h>
#include <stdlib.h>
typedef pthread_rwlock_t MutexType;
#else
#include <mutex>
typedef std::mutex MutexType;
#endif
namespace re2 {
#define HAVE_PTHREAD 1
#define HAVE_RWLOCK 1
#if defined(NO_THREADS)
typedef int MutexType; // to keep a lock-count
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
// Needed for pthread_rwlock_*. If it causes problems, you could take it
// out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
// *does* cause problems for FreeBSD, or MacOSX, but isn't needed
// for locking there.)
# ifdef __linux__
# undef _XOPEN_SOURCE
# define _XOPEN_SOURCE 500 // may be needed to get the rwlock calls
# endif
# include <pthread.h>
typedef pthread_rwlock_t MutexType;
#elif defined(HAVE_PTHREAD)
# include <pthread.h>
typedef pthread_mutex_t MutexType;
#elif defined(WIN32)
# define WIN32_LEAN_AND_MEAN // We only need minimal includes
# ifdef GMUTEX_TRYLOCK
// We need Windows NT or later for TryEnterCriticalSection(). If you
// don't need that functionality, you can remove these _WIN32_WINNT
// lines, and change TryLock() to assert(0) or something.
# ifndef _WIN32_WINNT
# define _WIN32_WINNT 0x0400
# endif
# endif
# include <windows.h>
typedef CRITICAL_SECTION MutexType;
#else
# error Need to implement mutex.h for your architecture, or #define NO_THREADS
#endif
class Mutex {
public:
// Create a Mutex that is not held by anybody.
inline Mutex();
// Destructor
inline ~Mutex();
inline void Lock(); // Block if needed until free then acquire exclusively
inline void Unlock(); // Release a lock acquired via Lock()
inline bool TryLock(); // If free, Lock() and return true, else return false
// Note that on systems that don't support read-write locks, these may
// be implemented as synonyms to Lock() and Unlock(). So you can use
// these for efficiency, but don't use them anyplace where being able
@ -68,80 +45,44 @@ class Mutex {
inline void ReaderUnlock(); // Release a read share of this Mutex
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
inline void AssertHeld() { }
private:
MutexType mutex_;
// Catch the error of writing Mutex when intending MutexLock.
Mutex(Mutex *ignored);
// Disallow "evil" constructors
Mutex(const Mutex&);
void operator=(const Mutex&);
Mutex(const Mutex&) = delete;
Mutex& operator=(const Mutex&) = delete;
};
// Now the implementation of Mutex for various systems
#if defined(NO_THREADS)
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
// When we don't have threads, we can be either reading or writing,
// but not both. We can have lots of readers at once (in no-threads
// mode, that's most likely to happen in recursive function calls),
// but only one writer. We represent this by having mutex_ be -1 when
// writing and a number > 0 when reading (and 0 when no lock is held).
//
// In debug mode, we assert these invariants, while in non-debug mode
// we do nothing, for efficiency. That's why everything is in an
// assert.
#include <assert.h>
Mutex::Mutex() : mutex_(0) { }
Mutex::~Mutex() { assert(mutex_ == 0); }
void Mutex::Lock() { assert(--mutex_ == -1); }
void Mutex::Unlock() { assert(mutex_++ == -1); }
bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; }
void Mutex::ReaderLock() { assert(++mutex_ > 0); }
void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
#define SAFE_PTHREAD(fncall) \
do { \
if ((fncall) != 0) abort(); \
} while (0)
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
bool Mutex::TryLock() { return pthread_rwlock_trywrlock(&mutex_) == 0; }
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
#undef SAFE_PTHREAD
#elif defined(HAVE_PTHREAD)
#else
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); }
Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy(&mutex_)); }
void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock(&mutex_)); }
void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock(&mutex_)); }
bool Mutex::TryLock() { return pthread_mutex_trylock(&mutex_) == 0; }
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
void Mutex::ReaderUnlock() { Unlock(); }
#undef SAFE_PTHREAD
#elif defined(WIN32)
Mutex::Mutex() { InitializeCriticalSection(&mutex_); }
Mutex::~Mutex() { DeleteCriticalSection(&mutex_); }
void Mutex::Lock() { EnterCriticalSection(&mutex_); }
void Mutex::Unlock() { LeaveCriticalSection(&mutex_); }
bool Mutex::TryLock() { return TryEnterCriticalSection(&mutex_) != 0; }
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
Mutex::Mutex() { }
Mutex::~Mutex() { }
void Mutex::Lock() { mutex_.lock(); }
void Mutex::Unlock() { mutex_.unlock(); }
void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex.
void Mutex::ReaderUnlock() { Unlock(); }
#endif
// --------------------------------------------------------------------------
// Some helper classes
@ -152,9 +93,9 @@ class MutexLock {
~MutexLock() { mu_->Unlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
MutexLock(const MutexLock&);
void operator=(const MutexLock&);
MutexLock(const MutexLock&) = delete;
MutexLock& operator=(const MutexLock&) = delete;
};
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
@ -164,9 +105,9 @@ class ReaderMutexLock {
~ReaderMutexLock() { mu_->ReaderUnlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
ReaderMutexLock(const ReaderMutexLock&);
void operator=(const ReaderMutexLock&);
ReaderMutexLock(const ReaderMutexLock&) = delete;
ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
};
class WriterMutexLock {
@ -175,37 +116,16 @@ class WriterMutexLock {
~WriterMutexLock() { mu_->WriterUnlock(); }
private:
Mutex * const mu_;
// Disallow "evil" constructors
WriterMutexLock(const WriterMutexLock&);
void operator=(const WriterMutexLock&);
WriterMutexLock(const WriterMutexLock&) = delete;
WriterMutexLock& operator=(const WriterMutexLock&) = delete;
};
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name)
#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
#ifdef HAVE_PTHREAD
#define GLOBAL_MUTEX(name) \
static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
#define GLOBAL_MUTEX_LOCK(name) \
pthread_mutex_lock(&(name))
#define GLOBAL_MUTEX_UNLOCK(name) \
pthread_mutex_unlock(&(name))
#else
#define GLOBAL_MUTEX(name) \
static Mutex name
#define GLOBAL_MUTEX_LOCK(name) \
name.Lock()
#define GLOBAL_MUTEX_UNLOCK(name) \
name.Unlock()
#endif
#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
} // namespace re2
#endif /* #define RE2_UTIL_MUTEX_H_ */
#endif // UTIL_MUTEX_H_

View File

@ -6,12 +6,25 @@
// The main changes are the addition of the HitLimit method and
// compilation as PCRE in namespace re2.
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <limits>
#include <string>
#include <utility>
#include "util/util.h"
#include "util/flags.h"
#include "util/logging.h"
#include "util/pcre.h"
#include "util/strutil.h"
#if __GNUC__ > 5
// Silence warnings about the wacky formatting in the operator() functions.
// Note that we test for Clang first because it defines __GNUC__ as well.
#if defined(__clang__)
#elif defined(__GNUC__) && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
#endif
@ -26,6 +39,42 @@ DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)");
DEFINE_int32(regexp_match_limit, 1000000,
"default PCRE match limit (function calls)");
#ifndef USEPCRE
// Fake just enough of the PCRE API to allow this file to build. :)
// Stand-in for PCRE's pcre_extra, compiled only when USEPCRE is not defined.
// It declares just three plain int fields; presumably these mirror the
// same-named fields of the real structure (confirm against the PCRE headers).
struct pcre_extra {
  int flags;
  int match_limit;
  int match_limit_recursion;
};
#define PCRE_EXTRA_MATCH_LIMIT 0
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
#define PCRE_ANCHORED 0
#define PCRE_NOTEMPTY 0
#define PCRE_ERROR_NOMATCH 1
#define PCRE_ERROR_MATCHLIMIT 2
#define PCRE_ERROR_RECURSIONLIMIT 3
#define PCRE_INFO_CAPTURECOUNT 0
// No-op stand-in for pcre_free() when USEPCRE is not defined: the argument is
// ignored and nothing is released.
void pcre_free(void* /*ptr*/) {}
// Stub for pcre_compile() used when USEPCRE is not defined. Every argument is
// ignored, the error-message and error-offset out-parameters are left
// untouched, and the result is always a null pcre*.
pcre* pcre_compile(const char* /*pattern*/,
                   int /*options*/,
                   const char** /*errptr*/,
                   int* /*erroffset*/,
                   const unsigned char* /*tableptr*/) {
  return NULL;
}
// Stub for pcre_exec() used when USEPCRE is not defined. It ignores all of
// its arguments, never writes to the output vector, and always returns 0.
int pcre_exec(const pcre* /*code*/,
              const pcre_extra* /*extra*/,
              const char* /*subject*/,
              int /*length*/,
              int /*startoffset*/,
              int /*options*/,
              int* /*ovector*/,
              int /*ovecsize*/) {
  return 0;
}
// Stub for pcre_fullinfo() used when USEPCRE is not defined. It ignores all
// of its arguments, never writes through the result pointer, and always
// returns 0.
int pcre_fullinfo(const pcre* /*code*/,
                  const pcre_extra* /*extra*/,
                  int /*what*/,
                  void* /*where*/) {
  return 0;
}
#endif
namespace re2 {
// Maximum number of args we can set
@ -117,7 +166,7 @@ pcre* PCRE::Compile(Anchor anchor) {
// ANCHOR_BOTH Tack a "\z" to the end of the original pattern
// and use a pcre anchored match.
const char* error;
const char* error = "";
int eoffset;
pcre* re;
if (anchor != ANCHOR_BOTH) {
@ -181,8 +230,8 @@ bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
done:
int consumed;
int vec[kVecSize];
size_t consumed;
int vec[kVecSize] = {};
return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
}
@ -224,8 +273,8 @@ bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
done:
int consumed;
int vec[kVecSize];
size_t consumed;
int vec[kVecSize] = {};
return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
}
@ -267,8 +316,8 @@ bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
done:
int consumed;
int vec[kVecSize];
size_t consumed;
int vec[kVecSize] = {};
if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
args, n, vec, kVecSize)) {
input->remove_prefix(consumed);
@ -316,8 +365,8 @@ bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
done:
int consumed;
int vec[kVecSize];
size_t consumed;
int vec[kVecSize] = {};
if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
args, n, vec, kVecSize)) {
input->remove_prefix(consumed);
@ -330,7 +379,7 @@ done:
bool PCRE::Replace(string *str,
const PCRE& pattern,
const StringPiece& rewrite) {
int vec[kVecSize];
int vec[kVecSize] = {};
int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
if (matches == 0)
return false;
@ -349,12 +398,12 @@ int PCRE::GlobalReplace(string *str,
const PCRE& pattern,
const StringPiece& rewrite) {
int count = 0;
int vec[kVecSize];
int vec[kVecSize] = {};
string out;
size_t start = 0;
bool last_match_was_empty_string = false;
for (; start <= str->length();) {
while (start <= str->size()) {
// If the previous match was for the empty string, we shouldn't
// just match again: we'll match in the same way and get an
// infinite loop. Instead, we do the match in a special way:
@ -370,19 +419,20 @@ int PCRE::GlobalReplace(string *str,
matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
vec, kVecSize);
if (matches <= 0) {
if (start < str->length())
if (start < str->size())
out.push_back((*str)[start]);
start++;
last_match_was_empty_string = false;
continue;
}
} else {
matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
matches = pattern.TryMatch(*str, start, UNANCHORED, true,
vec, kVecSize);
if (matches <= 0)
break;
}
int matchstart = vec[0], matchend = vec[1];
assert(matchstart >= static_cast<int>(start));
size_t matchstart = vec[0], matchend = vec[1];
assert(matchstart >= start);
assert(matchend >= matchstart);
out.append(*str, start, matchstart - start);
@ -395,8 +445,9 @@ int PCRE::GlobalReplace(string *str,
if (count == 0)
return 0;
if (start < str->length())
out.append(*str, start, str->length() - start);
if (start < str->size())
out.append(*str, start, str->size() - start);
using std::swap;
swap(out, *str);
return count;
}
@ -405,7 +456,7 @@ bool PCRE::Extract(const StringPiece &text,
const PCRE& pattern,
const StringPiece &rewrite,
string *out) {
int vec[kVecSize];
int vec[kVecSize] = {};
int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
if (matches == 0)
return false;
@ -424,7 +475,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) {
// that. (This also makes it identical to the perl function of the
// same name except for the null-character special case;
// see `perldoc -f quotemeta`.)
for (int ii = 0; ii < unquoted.length(); ++ii) {
for (size_t ii = 0; ii < unquoted.size(); ++ii) {
// Note that using 'isalnum' here raises the benchmark time from
// 32ns to 58ns:
if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
@ -451,7 +502,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) {
/***** Actual matching and rewriting code *****/
bool PCRE::HitLimit() {
return hit_limit_;
return hit_limit_ != 0;
}
void PCRE::ClearHitLimit() {
@ -459,7 +510,7 @@ void PCRE::ClearHitLimit() {
}
int PCRE::TryMatch(const StringPiece& text,
int startpos,
size_t startpos,
Anchor anchor,
bool empty_ok,
int *vec,
@ -499,8 +550,8 @@ int PCRE::TryMatch(const StringPiece& text,
int rc = pcre_exec(re, // The regular expression object
&extra,
(text.data() == NULL) ? "" : text.data(),
text.size(),
startpos,
static_cast<int>(text.size()),
static_cast<int>(startpos),
options,
vec,
vecsize);
@ -554,14 +605,9 @@ int PCRE::TryMatch(const StringPiece& text,
return rc;
}
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#endif
bool PCRE::DoMatchImpl(const StringPiece& text,
Anchor anchor,
int* consumed,
size_t* consumed,
const Arg* const* args,
int n,
int* vec,
@ -589,7 +635,17 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
for (int i = 0; i < n; i++) {
const int start = vec[2*(i+1)];
const int limit = vec[2*(i+1)+1];
if (!args[i]->Parse(text.data() + start, limit-start)) {
// Avoid invoking undefined behavior when text.data() happens
// to be null and start happens to be -1, the latter being the
// case for an unmatched subexpression. Even if text.data() is
// not null, pointing one byte before was a longstanding bug.
const char* addr = NULL;
if (start != -1) {
addr = text.data() + start;
}
if (!args[i]->Parse(addr, limit-start)) {
// TODO: Should we indicate what the error was?
return false;
}
@ -598,17 +654,13 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
return true;
}
#if !__clang__
#pragma GCC diagnostic pop
#endif
bool PCRE::DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
size_t* consumed,
const Arg* const args[],
int n) const {
assert(n >= 0);
size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
const int vecsize = (1 + n) * 3; // results + PCRE workspace
// (as for kVecSize)
int* vec = new int[vecsize];
bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
@ -695,41 +747,52 @@ int PCRE::NumberOfCapturingGroups() const {
if (re_partial_ == NULL) return -1;
int result;
CHECK(pcre_fullinfo(re_partial_, // The regular expression object
int rc = pcre_fullinfo(re_partial_, // The regular expression object
NULL, // We did not study the pattern
PCRE_INFO_CAPTURECOUNT,
&result) == 0);
&result);
if (rc != 0) {
PCREPORT(ERROR) << "Unexpected return code: " << rc;
return -1;
}
return result;
}
/***** Parsers for various types *****/
bool PCRE::Arg::parse_null(const char* str, int n, void* dest) {
bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
// We fail if somebody asked us to store into a non-NULL void* pointer
return (dest == NULL);
}
bool PCRE::Arg::parse_string(const char* str, int n, void* dest) {
bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
reinterpret_cast<string*>(dest)->assign(str, n);
return true;
}
bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) {
bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
reinterpret_cast<StringPiece*>(dest)->set(str, n);
*(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
return true;
}
bool PCRE::Arg::parse_char(const char* str, int n, void* dest) {
bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<char*>(dest)) = str[0];
return true;
}
bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) {
// Parser for a signed char destination. The input must be exactly one byte
// long; otherwise parsing fails. When dest is NULL the length check is still
// performed but nothing is stored.
bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
  const bool is_single_byte = (n == 1);
  if (!is_single_byte)
    return false;
  if (dest != NULL) {
    signed char* out = reinterpret_cast<signed char*>(dest);
    *out = str[0];
  }
  return true;
}
bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
if (n != 1) return false;
if (dest == NULL) return true;
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
@ -746,7 +809,7 @@ static const int kMaxNumberLength = 32;
// a. "str" if no termination is needed
// b. "buf" if the string was copied and null-terminated
// c. "" if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
static const char* TerminateNumber(char* buf, const char* str, size_t n) {
if ((n > 0) && isspace(*str)) {
// We are less forgiving than the strtoxxx() routines and do not
// allow leading spaces.
@ -769,7 +832,7 @@ static const char* TerminateNumber(char* buf, const char* str, int n) {
}
bool PCRE::Arg::parse_long_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
if (n == 0) return false;
@ -786,7 +849,7 @@ bool PCRE::Arg::parse_long_radix(const char* str,
}
bool PCRE::Arg::parse_ulong_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
if (n == 0) return false;
@ -809,55 +872,55 @@ bool PCRE::Arg::parse_ulong_radix(const char* str,
}
bool PCRE::Arg::parse_short_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
long r;
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if ((short)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<short*>(dest)) = r;
*(reinterpret_cast<short*>(dest)) = (short)r;
return true;
}
bool PCRE::Arg::parse_ushort_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
unsigned long r;
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if ((ushort)r != r) return false; // Out of range
if ((unsigned short)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<unsigned short*>(dest)) = r;
*(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
return true;
}
bool PCRE::Arg::parse_int_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
long r;
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
if ((int)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<int*>(dest)) = r;
*(reinterpret_cast<int*>(dest)) = (int)r;
return true;
}
bool PCRE::Arg::parse_uint_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
unsigned long r;
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
if ((uint)r != r) return false; // Out of range
if ((unsigned int)r != r) return false; // Out of range
if (dest == NULL) return true;
*(reinterpret_cast<unsigned int*>(dest)) = r;
*(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
return true;
}
bool PCRE::Arg::parse_longlong_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
if (n == 0) return false;
@ -865,16 +928,16 @@ bool PCRE::Arg::parse_longlong_radix(const char* str,
str = TerminateNumber(buf, str, n);
char* end;
errno = 0;
int64 r = strtoll(str, &end, radix);
long long r = strtoll(str, &end, radix);
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<int64*>(dest)) = r;
*(reinterpret_cast<long long*>(dest)) = r;
return true;
}
bool PCRE::Arg::parse_ulonglong_radix(const char* str,
int n,
size_t n,
void* dest,
int radix) {
if (n == 0) return false;
@ -887,26 +950,32 @@ bool PCRE::Arg::parse_ulonglong_radix(const char* str,
}
char* end;
errno = 0;
uint64 r = strtoull(str, &end, radix);
unsigned long long r = strtoull(str, &end, radix);
if (end != str + n) return false; // Leftover junk
if (errno) return false;
if (dest == NULL) return true;
*(reinterpret_cast<uint64*>(dest)) = r;
*(reinterpret_cast<unsigned long long*>(dest)) = r;
return true;
}
bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
static bool parse_double_float(const char* str, size_t n, bool isfloat,
void* dest) {
if (n == 0) return false;
static const int kMaxLength = 200;
char buf[kMaxLength];
if (n >= kMaxLength) return false;
memcpy(buf, str, n);
buf[n] = '\0';
errno = 0;
char* end;
double r = strtod(buf, &end);
errno = 0;
double r;
if (isfloat) {
r = strtof(buf, &end);
} else {
r = strtod(buf, &end);
}
if (end != buf + n) {
#ifdef COMPILER_MSVC
#ifdef _WIN32
// Microsoft's strtod() doesn't handle inf and nan, so we have to
// handle it explicitly. Speed is not important here because this
// code is only called in unit tests.
@ -918,12 +987,12 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
} else if ('+' == *i) {
++i;
}
if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) {
r = numeric_limits<double>::infinity();
if (0 == _stricmp(i, "inf") || 0 == _stricmp(i, "infinity")) {
r = std::numeric_limits<double>::infinity();
if (!pos)
r = -r;
} else if (0 == stricmp(i, "nan")) {
r = numeric_limits<double>::quiet_NaN();
} else if (0 == _stricmp(i, "nan")) {
r = std::numeric_limits<double>::quiet_NaN();
} else {
return false;
}
@ -933,42 +1002,47 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
}
if (errno) return false;
if (dest == NULL) return true;
if (isfloat) {
*(reinterpret_cast<float*>(dest)) = (float)r;
} else {
*(reinterpret_cast<double*>(dest)) = r;
}
return true;
}
bool PCRE::Arg::parse_float(const char* str, int n, void* dest) {
double r;
if (!parse_double(str, n, &r)) return false;
if (dest == NULL) return true;
*(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
return true;
bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
return parse_double_float(str, n, false, dest);
}
bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
return parse_double_float(str, n, true, dest);
}
#define DEFINE_INTEGER_PARSERS(name) \
bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \
#define DEFINE_INTEGER_PARSER(name) \
bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
return parse_##name##_radix(str, n, dest, 10); \
} \
bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
return parse_##name##_radix(str, n, dest, 16); \
} \
bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
void* dest) { \
return parse_##name##_radix(str, n, dest, 8); \
} \
bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
void* dest) { \
return parse_##name##_radix(str, n, dest, 0); \
}
DEFINE_INTEGER_PARSERS(short);
DEFINE_INTEGER_PARSERS(ushort);
DEFINE_INTEGER_PARSERS(int);
DEFINE_INTEGER_PARSERS(uint);
DEFINE_INTEGER_PARSERS(long);
DEFINE_INTEGER_PARSERS(ulong);
DEFINE_INTEGER_PARSERS(longlong);
DEFINE_INTEGER_PARSERS(ulonglong);
DEFINE_INTEGER_PARSER(short);
DEFINE_INTEGER_PARSER(ushort);
DEFINE_INTEGER_PARSER(int);
DEFINE_INTEGER_PARSER(uint);
DEFINE_INTEGER_PARSER(long);
DEFINE_INTEGER_PARSER(ulong);
DEFINE_INTEGER_PARSER(longlong);
DEFINE_INTEGER_PARSER(ulonglong);
#undef DEFINE_INTEGER_PARSERS
#undef DEFINE_INTEGER_PARSER
} // namespace re2

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_PCRE_H_
#define UTIL_PCRE_H_
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
// The main changes are the addition of the HitLimit method and
// compilation as PCRE in namespace re2.
@ -167,22 +170,9 @@ namespace re2 {
const bool UsingPCRE = true;
} // namespace re2
#else
struct pcre; // opaque
namespace re2 {
const bool UsingPCRE = false;
struct pcre;
struct pcre_extra { int flags, match_limit, match_limit_recursion; };
#define pcre_free(x) {}
#define PCRE_EXTRA_MATCH_LIMIT 0
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
#define PCRE_ANCHORED 0
#define PCRE_NOTEMPTY 0
#define PCRE_ERROR_NOMATCH 1
#define PCRE_ERROR_MATCHLIMIT 2
#define PCRE_ERROR_RECURSIONLIMIT 3
#define PCRE_INFO_CAPTURECOUNT 0
#define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); })
#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; })
#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; })
} // namespace re2
#endif
@ -258,7 +248,7 @@ class PCRE {
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, int)" exists)
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
@ -452,7 +442,7 @@ class PCRE {
// "*consumed" if successful.
bool DoMatch(const StringPiece& text,
Anchor anchor,
int* consumed,
size_t* consumed,
const Arg* const* args, int n) const;
// Return the number of capturing subpatterns, or -1 if the
@ -475,7 +465,7 @@ class PCRE {
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
// But the values for all subpatterns are filled into "vec".
int TryMatch(const StringPiece& text,
int startpos,
size_t startpos,
Anchor anchor,
bool empty_ok,
int *vec,
@ -492,7 +482,7 @@ class PCRE {
// internal implementation for DoMatch
bool DoMatchImpl(const StringPiece& text,
Anchor anchor,
int* consumed,
size_t* consumed,
const Arg* const args[],
int n,
int* vec,
@ -510,7 +500,9 @@ class PCRE {
int match_limit_; // Limit on execution resources
int stack_limit_; // Limit on stack resources (bytes)
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
DISALLOW_EVIL_CONSTRUCTORS(PCRE);
PCRE(const PCRE&) = delete;
PCRE& operator=(const PCRE&) = delete;
};
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
@ -565,7 +557,7 @@ class PCRE_Options {
template <class T>
class _PCRE_MatchObject {
public:
static inline bool Parse(const char* str, int n, void* dest) {
static inline bool Parse(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
@ -580,16 +572,21 @@ class PCRE::Arg {
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, int n, void* dest);
typedef bool (*Parser)(const char* str, size_t n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type, name) \
Arg(type* p) : arg_(p), parser_(name) {} \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char);
MAKE_PARSER(signed char, parse_schar);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
@ -598,10 +595,6 @@ class PCRE::Arg {
MAKE_PARSER(unsigned long, parse_ulong);
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
#undef MAKE_PARSER
@ -613,29 +606,31 @@ class PCRE::Arg {
}
// Parse the data
bool Parse(const char* str, int n) const;
bool Parse(const char* str, size_t n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, int n, void* dest);
static bool parse_char (const char* str, int n, void* dest);
static bool parse_uchar (const char* str, int n, void* dest);
static bool parse_float (const char* str, int n, void* dest);
static bool parse_double (const char* str, int n, void* dest);
static bool parse_string (const char* str, int n, void* dest);
static bool parse_stringpiece (const char* str, int n, void* dest);
static bool parse_null (const char* str, size_t n, void* dest);
static bool parse_char (const char* str, size_t n, void* dest);
static bool parse_schar (const char* str, size_t n, void* dest);
static bool parse_uchar (const char* str, size_t n, void* dest);
static bool parse_float (const char* str, size_t n, void* dest);
static bool parse_double (const char* str, size_t n, void* dest);
static bool parse_string (const char* str, size_t n, void* dest);
static bool parse_stringpiece (const char* str, size_t n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_ ## name(const char* str, int n, void* dest); \
static bool parse_ ## name ## _radix( \
const char* str, int n, void* dest, int radix); \
static bool parse_##name(const char* str, size_t n, void* dest); \
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
int radix); \
\
public: \
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
@ -647,23 +642,27 @@ class PCRE::Arg {
DECLARE_INTEGER_PARSER(ulonglong);
#undef DECLARE_INTEGER_PARSER
};
// Default constructor: no destination pointer, with parse_null as the parser.
inline PCRE::Arg::Arg()
    : arg_(NULL),
      parser_(parse_null) {}
// Constructor for untyped (void*) arguments: remembers p as the destination
// and installs parse_null as the parser.
inline PCRE::Arg::Arg(void* p)
    : arg_(p),
      parser_(parse_null) {}
inline bool PCRE::Arg::Parse(const char* str, int n) const {
inline bool PCRE::Arg::Parse(const char* str, size_t n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline PCRE::Arg Hex(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
} \
inline PCRE::Arg Octal(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
} \
inline PCRE::Arg CRadix(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); }
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
}
MAKE_INTEGER_PARSER(short, short);
MAKE_INTEGER_PARSER(unsigned short, ushort);
@ -677,3 +676,5 @@ MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
#undef MAKE_INTEGER_PARSER
} // namespace re2
#endif // UTIL_PCRE_H_

View File

@ -1,34 +0,0 @@
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Modified from Google perftools's tcmalloc_unittest.cc.
#include "util/random.h"
namespace re2 {
// Advances the generator by one step, stores the result in seed_ and returns
// it. In effect this computes seed_ = (seed_ * A) % M with A = 16807 and
// M = 2^31-1, using only 32-bit arithmetic on the two 16-bit halves of seed_.
// Note: when seed_ is 0 both partial products are 0, so the result is 0 again.
int32 ACMRandom::Next() {
  const int32 kModulus = 2147483647L;  // 2^31-1
  const int32 kMultiplier = 16807;

  // Multiply the low and high 16-bit halves of the seed separately.
  uint32 low = kMultiplier * (int32)(seed_ & 0xFFFF);
  uint32 high = kMultiplier * (int32)((uint32)seed_ >> 16);

  // Fold the low 15 bits of the high product into the upper half of the sum,
  // reducing whenever the running value exceeds the modulus.
  low += (high & 0x7FFF) << 16;
  if (low > kModulus) {
    low &= kModulus;
    ++low;
  }

  // Add the remaining bits of the high product and reduce once more.
  low += high >> 15;
  if (low > kModulus) {
    low &= kModulus;
    ++low;
  }

  seed_ = (int32) low;
  return seed_;
}
// Draws the next value from the generator and returns it modulo n.
// n must be non-zero, since it is used directly as the divisor.
int32 ACMRandom::Uniform(int32 n) {
  const int32 sample = Next();
  return sample % n;
}
} // namespace re2

View File

@ -1,29 +0,0 @@
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Modified from Google perftools's tcmalloc_unittest.cc.
#ifndef RE2_UTIL_RANDOM_H__
#define RE2_UTIL_RANDOM_H__
#include "util/util.h"
namespace re2 {
// ACM minimal standard random number generator. (re-entrant.)
// The only state is the seed_ member, which Next() updates on every call.
class ACMRandom {
 public:
  // Starts the sequence from the given seed.
  ACMRandom(int32 seed)
      : seed_(seed) {}

  // Advances the generator and returns the new value (defined out of line).
  int32 Next();

  // Returns a value derived from Next() and the given bound (defined out of
  // line).
  int32 Uniform(int32 n);

  // Restarts the sequence from the given seed.
  void Reset(int32 seed) {
    seed_ = seed;
  }

 private:
  int32 seed_;  // Current generator state.
};
#endif // RE2_UTIL_RANDOM_H__

View File

@ -11,8 +11,10 @@
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "util/utf.h"
namespace re2 {
@ -133,7 +135,7 @@ runetochar(char *str, const Rune *rune)
*/
c = *rune;
if(c <= Rune1) {
str[0] = c;
str[0] = static_cast<char>(c);
return 1;
}
@ -142,7 +144,7 @@ runetochar(char *str, const Rune *rune)
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx);
str[0] = T2 | static_cast<char>(c >> 1*Bitx);
str[1] = Tx | (c & Maskx);
return 2;
}
@ -161,7 +163,7 @@ runetochar(char *str, const Rune *rune)
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx);
str[0] = T3 | static_cast<char>(c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
@ -171,7 +173,7 @@ runetochar(char *str, const Rune *rune)
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[0] = T4 | static_cast<char>(c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);

View File

@ -2,6 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_SPARSE_ARRAY_H_
#define UTIL_SPARSE_ARRAY_H_
// DESCRIPTION
//
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
@ -52,47 +55,58 @@
// IMPLEMENTATION
//
// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
// size max_size_. At any point, the number of elements in the sparse array is
// size_.
// SparseArray is an array dense_ and an array sparse_, both of size max_size_.
// At any point, the number of elements in the sparse array is size_.
//
// The vector dense_ contains the size_ elements in the sparse array (with
// The array dense_ contains the size_ elements in the sparse array (with
// their indices),
// in the order that the elements were first inserted. This array is dense:
// the size_ pairs are dense_[0] through dense_[size_-1].
//
// The array sparse_to_dense_ maps from indices in [0,m) to indices in
// [0,size_).
// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
// For indices not present in the array, sparse_to_dense_ can contain
// any value at all, perhaps outside the range [0, size_) but perhaps not.
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
// For indices present in the array, dense_[sparse_[i]].index_ == i.
// For indices not present in the array, sparse_ can contain any value at all,
// perhaps outside the range [0, size_) but perhaps not.
//
// The lax requirement on sparse_to_dense_ values makes clearing
// the array very easy: set size_ to 0. Lookups are slightly more
// complicated. An index i has a value in the array if and only if:
// sparse_to_dense_[i] is in [0, size_) AND
// dense_[sparse_to_dense_[i]].index_ == i.
// The lax requirement on sparse_ values makes clearing the array very easy:
// set size_ to 0. Lookups are slightly more complicated.
// An index i has a value in the array if and only if:
// sparse_[i] is in [0, size_) AND
// dense_[sparse_[i]].index_ == i.
// Only if both of these properties hold is it safe to refer to
// dense_[sparse_to_dense_[i]].value_
// dense_[sparse_[i]].value_
// as the value associated with index i.
//
// To insert a new entry, set sparse_to_dense_[i] to size_,
// To insert a new entry, set sparse_[i] to size_,
// initialize dense_[size_], and then increment size_.
//
// Deletion of specific values from the array is implemented by
// swapping dense_[size_-1] and the dense_ entry being deleted and then
// updating the appropriate sparse_to_dense_ entries.
// updating the appropriate sparse_ entries.
//
// To make the sparse array as efficient as possible for non-primitive types,
// elements may or may not be destroyed when they are deleted from the sparse
// array through a call to erase(), erase_existing() or resize(). They
// immediately become inaccessible, but they are only guaranteed to be
// destroyed when the SparseArray destructor is called.
//
// A moved-from SparseArray will be empty.
#ifndef RE2_UTIL_SPARSE_ARRAY_H__
#define RE2_UTIL_SPARSE_ARRAY_H__
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include "util/util.h"
#include <assert.h>
#include <stdint.h>
#include <string.h>
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#include <algorithm>
#include <memory>
#include <type_traits>
#include <utility>
namespace re2 {
@ -100,36 +114,49 @@ template<typename Value>
class SparseArray {
public:
SparseArray();
SparseArray(int max_size);
explicit SparseArray(int max_size);
~SparseArray();
// IndexValue pairs: exposed in SparseArray::iterator.
class IndexValue;
static_assert(std::is_trivially_destructible<IndexValue>::value,
"IndexValue must be trivially destructible");
typedef IndexValue value_type;
typedef typename vector<IndexValue>::iterator iterator;
typedef typename vector<IndexValue>::const_iterator const_iterator;
typedef IndexValue* iterator;
typedef const IndexValue* const_iterator;
inline const IndexValue& iv(int i) const;
SparseArray(const SparseArray& src);
SparseArray(SparseArray&& src) /*noexcept*/;
SparseArray& operator=(const SparseArray& src);
SparseArray& operator=(SparseArray&& src) /*noexcept*/;
const IndexValue& iv(int i) const;
// Return the number of entries in the array.
int size() const {
return size_;
}
// Indicate whether the array is empty.
int empty() const {
return size_ == 0;
}
// Iterate over the array.
iterator begin() {
return dense_.begin();
return dense_.get();
}
iterator end() {
return dense_.begin() + size_;
return dense_.get() + size_;
}
const_iterator begin() const {
return dense_.begin();
return dense_.get();
}
const_iterator end() const {
return dense_.begin() + size_;
return dense_.get() + size_;
}
// Change the maximum size of the array.
@ -148,39 +175,68 @@ class SparseArray {
}
// Check whether index i is in the array.
inline bool has_index(int i) const;
bool has_index(int i) const;
// Comparison function for sorting.
// Can sort the sparse array so that future iterations
// will visit indices in increasing order using
// sort(arr.begin(), arr.end(), arr.less);
// std::sort(arr.begin(), arr.end(), arr.less);
static bool less(const IndexValue& a, const IndexValue& b);
public:
// Set the value at index i to v.
inline iterator set(int i, Value v);
iterator set(int i, const Value& v) {
return SetInternal(true, i, v);
}
iterator set(int i, Value&& v) { // NOLINT
return SetInternal(true, i, std::move(v));
}
pair<iterator, bool> insert(const value_type& new_value);
std::pair<iterator, bool> insert(const value_type& v) {
return InsertInternal(v);
}
std::pair<iterator, bool> insert(value_type&& v) { // NOLINT
return InsertInternal(std::move(v));
}
// Returns the value at index i
// or defaultv if index i is not initialized in the array.
inline Value get(int i, Value defaultv) const;
template <typename... Args>
std::pair<iterator, bool> emplace(Args&&... args) { // NOLINT
return InsertInternal(value_type(std::forward<Args>(args)...));
}
iterator find(int i);
iterator find(int i) {
if (has_index(i))
return dense_.get() + sparse_[i];
return end();
}
const_iterator find(int i) const;
const_iterator find(int i) const {
if (has_index(i))
return dense_.get() + sparse_[i];
return end();
}
// Change the value at index i to v.
// Fast but unsafe: only use if has_index(i) is true.
inline iterator set_existing(int i, Value v);
iterator set_existing(int i, const Value& v) {
return SetExistingInternal(i, v);
}
iterator set_existing(int i, Value&& v) { // NOLINT
return SetExistingInternal(i, std::move(v));
}
// Set the value at the new index i to v.
// Fast but unsafe: only use if has_index(i) is false.
inline iterator set_new(int i, Value v);
iterator set_new(int i, const Value& v) {
return SetInternal(false, i, v);
}
iterator set_new(int i, Value&& v) { // NOLINT
return SetInternal(false, i, std::move(v));
}
// Get the value at index i from the array.
// Fast but unsafe: only use if has_index(i) is true.
inline Value get_existing(int i) const;
const Value& get_existing(int i) const;
// Erasing items from the array during iteration is in general
// NOT safe. There is one special case, which is that the current
@ -201,37 +257,132 @@ class SparseArray {
// the iterators could walk past the end of the array.
// Erases the element at index i from the array.
inline void erase(int i);
void erase(int i);
// Erases the element at index i from the array.
// Fast but unsafe: only use if has_index(i) is true.
inline void erase_existing(int i);
void erase_existing(int i);
private:
template <typename U>
std::pair<iterator, bool> InsertInternal(U&& v) {
DebugCheckInvariants();
std::pair<iterator, bool> p;
if (has_index(v.index_)) {
p = {dense_.get() + sparse_[v.index_], false};
} else {
p = {set_new(std::forward<U>(v).index_, std::forward<U>(v).second), true};
}
DebugCheckInvariants();
return p;
}
template <typename U>
iterator SetInternal(bool allow_overwrite, int i, U&& v) { // NOLINT
DebugCheckInvariants();
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
assert(false && "illegal index");
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!allow_overwrite) {
assert(!has_index(i));
create_index(i);
} else {
if (!has_index(i))
create_index(i);
}
return set_existing(i, std::forward<U>(v)); // NOLINT
}
template <typename U>
iterator SetExistingInternal(int i, U&& v) { // NOLINT
DebugCheckInvariants();
assert(has_index(i));
dense_[sparse_[i]].value() = std::forward<U>(v);
DebugCheckInvariants();
return dense_.get() + sparse_[i];
}
// Add the index i to the array.
// Only use if has_index(i) is known to be false.
// Since it doesn't set the value associated with i,
// this function is private, only intended as a helper
// for other methods.
inline void create_index(int i);
void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
inline void DebugCheckInvariants() const;
void DebugCheckInvariants() const;
int size_;
int max_size_;
int* sparse_to_dense_;
vector<IndexValue> dense_;
bool valgrind_;
// Initializes memory for elements [min, max).
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
for (int i = min; i < max; i++) {
sparse_[i] = 0xababababU;
}
#endif
}
DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
int size_ = 0;
int max_size_ = 0;
std::unique_ptr<int[]> sparse_;
std::unique_ptr<IndexValue[]> dense_;
};
template<typename Value>
SparseArray<Value>::SparseArray()
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {}
SparseArray<Value>::SparseArray() = default;
// Copy constructor: allocates fresh sparse/dense buffers with src's capacity
// and copies every one of the max_size_ slots (not only the size_ live ones).
template<typename Value>
SparseArray<Value>::SparseArray(const SparseArray& src)
    : size_(src.size_),
      max_size_(src.max_size_),
      sparse_(new int[src.max_size_]),
      dense_(new IndexValue[src.max_size_]) {
  const int capacity = src.max_size_;

  const int* const sparse_from = src.sparse_.get();
  std::copy(sparse_from, sparse_from + capacity, sparse_.get());

  const IndexValue* const dense_from = src.dense_.get();
  std::copy(dense_from, dense_from + capacity, dense_.get());
}
// Move constructor: steals src's buffers and bookkeeping, then resets src's
// counters so that the moved-from array reports itself as empty.
template<typename Value>
SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/  // NOLINT
    : sparse_(std::move(src.sparse_)),
      dense_(std::move(src.dense_)) {
  size_ = src.size_;
  max_size_ = src.max_size_;

  // Leave the source in the documented "empty" state.
  src.max_size_ = 0;
  src.size_ = 0;
}
// Copy assignment: replaces this array's contents with a deep copy of src.
//
// Both replacement buffers are fully built before any member of *this is
// touched, so a failed allocation or a throwing Value copy leaves *this
// exactly as it was (the previous version updated size_/max_size_ first and
// could be left describing buffers it did not own yet).
// Self-assignment is safe: src is only read until the final commit step.
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
  const int new_max_size = src.max_size_;

  std::unique_ptr<int[]> a(new int[new_max_size]);
  std::copy_n(src.sparse_.get(), new_max_size, a.get());

  std::unique_ptr<IndexValue[]> b(new IndexValue[new_max_size]);
  std::copy_n(src.dense_.get(), new_max_size, b.get());

  // Commit: nothing below can throw.
  const int new_size = src.size_;
  sparse_ = std::move(a);
  dense_ = std::move(b);
  size_ = new_size;
  max_size_ = new_max_size;
  return *this;
}
// Move assignment: takes over src's buffers and bookkeeping and leaves src
// empty (size 0, max_size 0).
//
// Self-move is a no-op. Without the guard the counters were copied and then
// zeroed through the same object, silently discarding the array's contents.
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(
    SparseArray&& src) /*noexcept*/ {  // NOLINT
  if (this == &src)
    return *this;

  size_ = src.size_;
  max_size_ = src.max_size_;
  sparse_ = std::move(src.sparse_);
  dense_ = std::move(src.dense_);

  // clear out the source
  src.size_ = 0;
  src.max_size_ = 0;
  return *this;
}
// IndexValue pairs: exposed in SparseArray::iterator.
template<typename Value>
@ -242,48 +393,55 @@ class SparseArray<Value>::IndexValue {
typedef Value second_type;
IndexValue() {}
IndexValue(int index, const Value& value) : second(value), index_(index) {}
IndexValue(int i, const Value& v) : index_(i), second(v) {}
IndexValue(int i, Value&& v) : index_(i), second(std::move(v)) {}
int index() const { return index_; }
Value value() const { return second; }
// Provide the data in the 'second' member so that the utilities
// in map-util work.
Value second;
Value& value() /*&*/ { return second; }
const Value& value() const /*&*/ { return second; }
//Value&& value() /*&&*/ { return std::move(second); } // NOLINT
private:
int index_;
public:
// Provide the data in the 'second' member so that the utilities
// in map-util work.
// TODO(billydonahue): 'second' is public for short-term compatibility.
// Users will be transitioned to using value() accessor.
Value second;
};
template<typename Value>
const typename SparseArray<Value>::IndexValue&
SparseArray<Value>::iv(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, size_);
assert(i >= 0);
assert(i < size_);
return dense_[i];
}
// Change the maximum size of the array.
// Invalidates all iterators.
template<typename Value>
void SparseArray<Value>::resize(int new_max_size) {
void SparseArray<Value>::resize(int max_size) {
DebugCheckInvariants();
if (new_max_size > max_size_) {
int* a = new int[new_max_size];
if (sparse_to_dense_) {
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
// Don't need to zero the memory but appease Valgrind.
if (valgrind_) {
for (int i = max_size_; i < new_max_size; i++)
a[i] = 0xababababU;
if (max_size > max_size_) {
std::unique_ptr<int[]> a(new int[max_size]);
if (sparse_) {
std::copy_n(sparse_.get(), max_size_, a.get());
}
delete[] sparse_to_dense_;
}
sparse_to_dense_ = a;
sparse_ = std::move(a);
dense_.resize(new_max_size);
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]);
if (dense_) {
std::copy_n(dense_.get(), max_size_, b.get());
}
max_size_ = new_max_size;
dense_ = std::move(b);
MaybeInitializeMemory(max_size_, max_size);
}
max_size_ = max_size;
if (size_ > max_size_)
size_ = max_size_;
DebugCheckInvariants();
@ -292,97 +450,20 @@ void SparseArray<Value>::resize(int new_max_size) {
// Check whether index i is in the array.
template<typename Value>
bool SparseArray<Value>::has_index(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, max_size_);
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
assert(i >= 0);
assert(i < max_size_);
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
return false;
}
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
return (uint)sparse_to_dense_[i] < (uint)size_ &&
dense_[sparse_to_dense_[i]].index_ == i;
}
// Set the value at index i to v.
template<typename Value>
typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
DebugCheckInvariants();
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!has_index(i))
create_index(i);
return set_existing(i, v);
// Unsigned comparison avoids checking sparse_[i] < 0.
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
dense_[sparse_[i]].index_ == i;
}
template<typename Value>
pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
const value_type& new_value) {
DebugCheckInvariants();
pair<typename SparseArray<Value>::iterator, bool> p;
if (has_index(new_value.index_)) {
p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
} else {
p = make_pair(set_new(new_value.index_, new_value.second), true);
}
DebugCheckInvariants();
return p;
}
template<typename Value>
Value SparseArray<Value>::get(int i, Value defaultv) const {
if (!has_index(i))
return defaultv;
return get_existing(i);
}
template<typename Value>
typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
if (has_index(i))
return dense_.begin() + sparse_to_dense_[i];
return end();
}
template<typename Value>
typename SparseArray<Value>::const_iterator
SparseArray<Value>::find(int i) const {
if (has_index(i)) {
return dense_.begin() + sparse_to_dense_[i];
}
return end();
}
template<typename Value>
typename SparseArray<Value>::iterator
SparseArray<Value>::set_existing(int i, Value v) {
DebugCheckInvariants();
DCHECK(has_index(i));
dense_[sparse_to_dense_[i]].second = v;
DebugCheckInvariants();
return dense_.begin() + sparse_to_dense_[i];
}
template<typename Value>
typename SparseArray<Value>::iterator
SparseArray<Value>::set_new(int i, Value v) {
DebugCheckInvariants();
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
DCHECK(!has_index(i));
create_index(i);
return set_existing(i, v);
}
template<typename Value>
Value SparseArray<Value>::get_existing(int i) const {
DCHECK(has_index(i));
return dense_[sparse_to_dense_[i]].second;
const Value& SparseArray<Value>::get_existing(int i) const {
assert(has_index(i));
return dense_[sparse_[i]].second;
}
template<typename Value>
@ -396,11 +477,11 @@ void SparseArray<Value>::erase(int i) {
template<typename Value>
void SparseArray<Value>::erase_existing(int i) {
DebugCheckInvariants();
DCHECK(has_index(i));
int di = sparse_to_dense_[i];
assert(has_index(i));
int di = sparse_[i];
if (di < size_ - 1) {
dense_[di] = dense_[size_ - 1];
sparse_to_dense_[dense_[di].index_] = di;
dense_[di] = std::move(dense_[size_ - 1]);
sparse_[dense_[di].index_] = di;
}
size_--;
DebugCheckInvariants();
@ -408,38 +489,30 @@ void SparseArray<Value>::erase_existing(int i) {
template<typename Value>
void SparseArray<Value>::create_index(int i) {
DCHECK(!has_index(i));
DCHECK_LT(size_, max_size_);
sparse_to_dense_[i] = size_;
assert(!has_index(i));
assert(size_ < max_size_);
sparse_[i] = size_;
dense_[size_].index_ = i;
size_++;
}
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
max_size_ = max_size;
sparse_to_dense_ = new int[max_size];
valgrind_ = RunningOnValgrind();
dense_.resize(max_size);
// Don't need to zero the new memory, but appease Valgrind.
if (valgrind_) {
for (int i = 0; i < max_size; i++) {
sparse_to_dense_[i] = 0xababababU;
dense_[i].index_ = 0xababababU;
}
}
sparse_.reset(new int[max_size]);
dense_.reset(new IndexValue[max_size]);
size_ = 0;
MaybeInitializeMemory(size_, max_size);
max_size_ = max_size;
DebugCheckInvariants();
}
template<typename Value> SparseArray<Value>::~SparseArray() {
DebugCheckInvariants();
delete[] sparse_to_dense_;
}
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
DCHECK_LE(0, size_);
DCHECK_LE(size_, max_size_);
DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
assert(0 <= size_);
assert(size_ <= max_size_);
assert(size_ == 0 || sparse_ != NULL);
}
// Comparison function for sorting.
@ -450,4 +523,4 @@ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
} // namespace re2
#endif // RE2_UTIL_SPARSE_ARRAY_H__
#endif // UTIL_SPARSE_ARRAY_H_

View File

@ -2,9 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_SPARSE_SET_H_
#define UTIL_SPARSE_SET_H_
// DESCRIPTION
//
// SparseSet<T>(m) is a set of integers in [0, m).
// SparseSet(m) is a set of integers in [0, m).
// It requires sizeof(int)*m memory, but it provides
// fast iteration through the elements in the set and fast clearing
// of the set.
@ -20,7 +23,7 @@
// is the number of items in the set (not O(m)).
//
// The set iterator visits entries in the order they were first
// inserted into the array. It is safe to add items to the set while
// inserted into the set. It is safe to add items to the set while
// using an iterator: the iterator will visit indices added to the set
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseSet can be a convenient
@ -38,142 +41,226 @@
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// For a generalization to sparse array, see sparse_array.h.
// This is a specialization of sparse array; see sparse_array.h.
// IMPLEMENTATION
//
// See sparse_array.h for implementation details
// See sparse_array.h for implementation details.
#ifndef RE2_UTIL_SPARSE_SET_H__
#define RE2_UTIL_SPARSE_SET_H__
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include "util/util.h"
#include <assert.h>
#include <stdint.h>
#include <string.h>
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#include <algorithm>
#include <memory>
#include <utility>
namespace re2 {
class SparseSet {
template<typename Value>
class SparseSetT {
public:
SparseSet()
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {}
SparseSet(int max_size) {
max_size_ = max_size;
sparse_to_dense_ = new int[max_size];
dense_ = new int[max_size];
valgrind_ = RunningOnValgrind();
// Don't need to zero the memory, but do so anyway
// to appease Valgrind.
if (valgrind_) {
for (int i = 0; i < max_size; i++) {
dense_[i] = 0xababababU;
sparse_to_dense_[i] = 0xababababU;
}
}
size_ = 0;
}
~SparseSet() {
delete[] sparse_to_dense_;
delete[] dense_;
}
SparseSetT();
explicit SparseSetT(int max_size);
~SparseSetT();
typedef int* iterator;
typedef const int* const_iterator;
int size() const { return size_; }
iterator begin() { return dense_; }
iterator end() { return dense_ + size_; }
const_iterator begin() const { return dense_; }
const_iterator end() const { return dense_ + size_; }
// Return the number of entries in the set.
int size() const {
return size_;
}
// Change the maximum size of the array.
// Indicate whether the set is empty.
int empty() const {
return size_ == 0;
}
// Iterate over the set.
iterator begin() {
return dense_.get();
}
iterator end() {
return dense_.get() + size_;
}
const_iterator begin() const {
return dense_.get();
}
const_iterator end() const {
return dense_.get() + size_;
}
// Change the maximum size of the set.
// Invalidates all iterators.
void resize(int new_max_size) {
if (size_ > new_max_size)
size_ = new_max_size;
if (new_max_size > max_size_) {
int* a = new int[new_max_size];
if (sparse_to_dense_) {
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
if (valgrind_) {
for (int i = max_size_; i < new_max_size; i++)
a[i] = 0xababababU;
}
delete[] sparse_to_dense_;
}
sparse_to_dense_ = a;
void resize(int max_size);
a = new int[new_max_size];
if (dense_) {
memmove(a, dense_, size_*sizeof a[0]);
if (valgrind_) {
for (int i = size_; i < new_max_size; i++)
a[i] = 0xababababU;
}
delete[] dense_;
}
dense_ = a;
}
max_size_ = new_max_size;
}
// Return the maximum size of the array.
// Return the maximum size of the set.
// Indices can be in the range [0, max_size).
int max_size() const { return max_size_; }
// Clear the array.
void clear() { size_ = 0; }
// Check whether i is in the array.
bool contains(int i) const {
DCHECK_GE(i, 0);
DCHECK_LT(i, max_size_);
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
return false;
}
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
return (uint)sparse_to_dense_[i] < (uint)size_ &&
dense_[sparse_to_dense_[i]] == i;
int max_size() const {
return max_size_;
}
// Adds i to the set.
void insert(int i) {
if (!contains(i))
insert_new(i);
// Clear the set.
void clear() {
size_ = 0;
}
// Set the value at the new index i to v.
// Check whether index i is in the set.
bool contains(int i) const;
// Comparison function for sorting.
// Can sort the sparse set so that future iterations
// will visit indices in increasing order using
// std::sort(arr.begin(), arr.end(), arr.less);
static bool less(int a, int b);
public:
// Insert index i into the set.
iterator insert(int i) {
return InsertInternal(true, i);
}
// Insert index i into the set.
// Fast but unsafe: only use if contains(i) is false.
void insert_new(int i) {
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
iterator insert_new(int i) {
return InsertInternal(false, i);
}
private:
iterator InsertInternal(bool allow_existing, int i) {
DebugCheckInvariants();
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
assert(false && "illegal index");
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return;
return begin();
}
DCHECK(!contains(i));
DCHECK_LT(size_, max_size_);
sparse_to_dense_[i] = size_;
if (!allow_existing) {
assert(!contains(i));
create_index(i);
} else {
if (!contains(i))
create_index(i);
}
DebugCheckInvariants();
return dense_.get() + sparse_[i];
}
// Add the index i to the set.
// Only use if contains(i) is known to be false.
// This function is private, only intended as a helper
// for other methods.
void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
// Initializes memory for elements [min, max).
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
for (int i = min; i < max; i++) {
sparse_[i] = 0xababababU;
}
#endif
}
int size_ = 0;
int max_size_ = 0;
std::unique_ptr<int[]> sparse_;
std::unique_ptr<int[]> dense_;
};
// Default constructor: relies on the in-class member initializers, i.e.
// size_ == 0, max_size_ == 0 and both buffers null. Call resize() before
// inserting anything.
template<typename Value>
SparseSetT<Value>::SparseSetT() = default;
// Change the maximum size of the set.
// Invalidates all iterators.
// Sets the capacity of the set to max_size. Growing reallocates both buffers
// and carries the old contents over; shrinking keeps the buffers and merely
// truncates size_ when it exceeds the new capacity.
template<typename Value>
void SparseSetT<Value>::resize(int max_size) {
  DebugCheckInvariants();

  const int old_max_size = max_size_;
  if (old_max_size < max_size) {
    // Replace sparse_ first, then dense_, each via a temporary owner.
    std::unique_ptr<int[]> grown_sparse(new int[max_size]);
    if (sparse_) {
      const int* const from = sparse_.get();
      std::copy(from, from + old_max_size, grown_sparse.get());
    }
    sparse_ = std::move(grown_sparse);

    std::unique_ptr<int[]> grown_dense(new int[max_size]);
    if (dense_) {
      const int* const from = dense_.get();
      std::copy(from, from + old_max_size, grown_dense.get());
    }
    dense_ = std::move(grown_dense);

    // Only the freshly added tail [old_max_size, max_size) needs attention.
    MaybeInitializeMemory(old_max_size, max_size);
  }

  max_size_ = max_size;
  if (max_size_ < size_)
    size_ = max_size_;

  DebugCheckInvariants();
}
// Check whether index i is in the set.
// Reports whether index i is currently a member of the set.
template<typename Value>
bool SparseSetT<Value>::contains(int i) const {
  assert(i >= 0);
  assert(i < max_size_);

  // When asserts are compiled out, an out-of-range index is simply absent.
  // Comparing as unsigned folds the "i < 0" case into the upper-bound test.
  const uint32_t index = static_cast<uint32_t>(i);
  if (index >= static_cast<uint32_t>(max_size_))
    return false;

  // sparse_ is allocated without being initialized, so sparse_[i] is only
  // trusted once it names a live dense_ slot that points back at i.
  // The unsigned comparison again covers sparse_[i] < 0.
  const uint32_t slot = static_cast<uint32_t>(sparse_[i]);
  if (slot >= static_cast<uint32_t>(size_))
    return false;
  return dense_[sparse_[i]] == i;
}
// Appends index i to the set. The caller must already know that i is absent
// and that there is room for one more element.
template<typename Value>
void SparseSetT<Value>::create_index(int i) {
  assert(!contains(i));
  assert(size_ < max_size_);

  // Claim the next free dense slot and link both directions.
  const int slot = size_++;
  dense_[slot] = i;
  sparse_[i] = slot;
}
// Constructs an empty set able to hold indices in [0, max_size).
template<typename Value> SparseSetT<Value>::SparseSetT(int max_size)
    : size_(0),
      max_size_(max_size),
      sparse_(new int[max_size]),
      dense_(new int[max_size]) {
  // Covers the whole freshly allocated range [0, max_size).
  MaybeInitializeMemory(0, max_size);
  DebugCheckInvariants();
}
// Destructor: only re-checks the invariants. sparse_ and dense_ are
// std::unique_ptr members and release their arrays on their own.
template<typename Value> SparseSetT<Value>::~SparseSetT() {
DebugCheckInvariants();
}
// Sanity-checks the size bookkeeping. The checks are plain assert()s, so they
// compile to nothing when NDEBUG is defined.
template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const {
// size_ must lie in [0, max_size_].
assert(0 <= size_);
assert(size_ <= max_size_);
// A non-empty set must have its sparse_ buffer allocated.
assert(size_ == 0 || sparse_ != NULL);
}
// Comparison function for sorting.
// Can sort the sparse array so that future iterations
// will visit indices in increasing order using
// sort(arr.begin(), arr.end(), arr.less);
static bool less(int a, int b) { return a < b; }
// Strict "less than" on set members (plain integer comparison); intended as
// the comparator for std::sort(set.begin(), set.end(), set.less).
template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
return a < b;
}
private:
int size_;
int max_size_;
int* sparse_to_dense_;
int* dense_;
bool valgrind_;
DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
};
typedef SparseSetT<void> SparseSet;
} // namespace re2
#endif // RE2_UTIL_SPARSE_SET_H__
#endif // UTIL_SPARSE_SET_H_

View File

@ -1,87 +0,0 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/stringpiece.h"
#include "util/util.h"
using re2::StringPiece;
// Streams the raw bytes of 'piece' (exactly piece.size() of them, embedded
// NULs included) to 'o' and returns the stream for chaining.
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  return o.write(piece.data(), piece.size());
}
// Byte-wise equality of two pieces: equal lengths and identical contents.
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
  const int len = x.size();
  if (len != y.size())
    return false;
  if (len <= 0)
    return true;  // nothing to compare

  const char* const lhs = x.data();
  const char* const rhs = y.data();

  // Check the final byte up front: strings sharing a long common prefix are
  // then rejected without walking the whole prefix.
  if (lhs[len - 1] != rhs[len - 1])
    return false;

  for (int k = 0; k < len; ++k) {
    if (lhs[k] != rhs[k])
      return false;
  }
  return true;
}
// Replaces the contents of *target with a copy of the length_ bytes starting
// at ptr_.
void StringPiece::CopyToString(string* target) const {
target->assign(ptr_, length_);
}
int StringPiece::copy(char* buf, size_type n, size_type pos) const {
int ret = min(length_ - pos, n);
memcpy(buf, ptr_ + pos, ret);
return ret;
}
// Returns the offset of the first occurrence of s that starts at or after
// pos, or npos (converted to the int return type) if there is none.
int StringPiece::find(const StringPiece& s, size_type pos) const {
// A negative length or a start offset past the end cannot match anything.
if (length_ < 0 || pos > static_cast<size_type>(length_))
return npos;
const char* result = std::search(ptr_ + pos, ptr_ + length_,
s.ptr_, s.ptr_ + s.length_);
const size_type xpos = result - ptr_;
// std::search returns its end iterator when nothing matched; for a non-empty
// s that makes xpos + s.length_ exceed length_, which selects npos below.
return xpos + s.length_ <= static_cast<size_type>(length_) ? xpos : npos;
}
// Returns the offset of the first byte equal to c at or after pos, or npos
// if there is no such byte (including when pos is at or past the end).
int StringPiece::find(char c, size_type pos) const {
  const bool out_of_range =
      length_ <= 0 || pos >= static_cast<size_type>(length_);
  if (out_of_range)
    return npos;

  const char* const end = ptr_ + length_;
  const char* const hit = std::find(ptr_ + pos, end, c);
  if (hit == end)
    return npos;
  return hit - ptr_;
}
// Returns the offset of the last occurrence of s that starts at or before
// pos, or npos if there is none.
int StringPiece::rfind(const StringPiece& s, size_type pos) const {
// s cannot occur inside a shorter piece.
if (length_ < s.length_) return npos;
const size_t ulen = length_;
// An empty needle matches at pos, clamped to the end of this piece.
if (s.length_ == 0) return min(ulen, pos);
// Bound the search window so that a match can start no later than pos.
const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
// std::find_end returns its end iterator when nothing matched.
return result != last ? result - ptr_ : npos;
}
// Returns the offset of the last byte equal to c at or before pos, or npos
// if there is none. An empty piece never matches.
int StringPiece::rfind(char c, size_type pos) const {
  if (length_ <= 0)
    return npos;

  // Start at pos, clamped to the last valid offset, and walk backwards.
  int i = min(pos, static_cast<size_type>(length_ - 1));
  while (i >= 0) {
    if (ptr_[i] == c)
      return i;
    --i;
  }
  return npos;
}
// Returns the sub-piece that starts at offset pos and spans at most n bytes.
// The result points into the same buffer (ptr_ + pos); nothing is copied.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
// Clamp the start to the end of this piece ...
if (pos > static_cast<size_type>(length_)) pos = length_;
// ... and the length to whatever remains after pos.
if (n > length_ - pos) n = length_ - pos;
return StringPiece(ptr_ + pos, n);
}
const StringPiece::size_type StringPiece::npos = size_type(-1);

View File

@ -1,78 +0,0 @@
// Copyright 2002 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
namespace re2 {

// Appends the printf-style expansion of 'format' with the arguments in 'ap'
// to *dst.
//
// 'ap' itself is never consumed: each formatting attempt works on a va_copy,
// because using a va_list may invalidate it.
//
// If vsnprintf keeps reporting an error (negative result), the function gives
// up once the scratch buffer has reached 1 GiB and leaves *dst unchanged.
// The earlier version doubled the buffer forever in that case, eventually
// overflowing 'int'.
static void StringAppendV(std::string* dst, const char* format, va_list ap) {
  // First try with a small fixed size buffer
  char space[1024];

  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
    // It fit
    dst->append(space, result);
    return;
  }

  // Largest size reachable by doubling from sizeof(space) without
  // overflowing int.
  const int kMaxLength = 1 << 30;

  // Repeatedly increase buffer size until it fits
  int length = sizeof(space);
  while (true) {
    if (result < 0) {
      // Older behavior: vsnprintf did not say how much room it needs, so
      // just try doubling the buffer size -- but not without bound.
      if (length >= kMaxLength)
        return;
      length *= 2;
    } else {
      // We need exactly "result+1" characters
      length = result + 1;
    }

    // A string owns the scratch memory, so it is released even if the
    // append below throws.
    std::string buf(length, '\0');

    // Restore the va_list before we use it again
    va_copy(backup_ap, ap);
    result = vsnprintf(&buf[0], length, format, backup_ap);
    va_end(backup_ap);

    if ((result >= 0) && (result < length)) {
      // It fit
      dst->append(buf, 0, result);
      return;
    }
  }
}

// Returns the printf-style expansion of 'format' as a new string.
std::string StringPrintf(const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  std::string result;
  StringAppendV(&result, format, ap);
  va_end(ap);
  return result;
}

// Replaces the contents of *dst with the printf-style expansion of 'format'.
void SStringPrintf(std::string* dst, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  dst->clear();
  StringAppendV(dst, format, ap);
  va_end(ap);
}

// Appends the printf-style expansion of 'format' to *dst.
void StringAppendF(std::string* dst, const char* format, ...) {
  va_list ap;
  va_start(ap, format);
  StringAppendV(dst, format, ap);
  va_end(ap);
}

}  // namespace re2

View File

@ -2,8 +2,15 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "re2/stringpiece.h"
#include <stdarg.h>
#include <stdio.h>
#include "util/strutil.h"
#ifdef _WIN32
#define snprintf _snprintf
#define vsnprintf _vsnprintf
#endif
namespace re2 {
@ -12,16 +19,16 @@ namespace re2 {
// Copies 'src' to 'dest', escaping dangerous characters using
// C-style escape sequences. 'src' and 'dest' should not overlap.
// Returns the number of bytes written to 'dest' (not including the \0)
// or -1 if there was insufficient space.
// or (size_t)-1 if there was insufficient space.
// ----------------------------------------------------------------------
int CEscapeString(const char* src, int src_len, char* dest,
int dest_len) {
static size_t CEscapeString(const char* src, size_t src_len,
char* dest, size_t dest_len) {
const char* src_end = src + src_len;
int used = 0;
size_t used = 0;
for (; src < src_end; src++) {
if (dest_len - used < 2) // Need space for two letter escape
return -1;
if (dest_len - used < 2) // space for two-character escape
return (size_t)-1;
unsigned char c = *src;
switch (c) {
@ -36,9 +43,9 @@ int CEscapeString(const char* src, int src_len, char* dest,
// digit then that digit must be escaped too to prevent it being
// interpreted as part of the character code by C.
if (c < ' ' || c > '~') {
if (dest_len - used < 4) // need space for 4 letter escape
return -1;
sprintf(dest + used, "\\%03o", c);
if (dest_len - used < 5) // space for four-character escape + \0
return (size_t)-1;
snprintf(dest + used, 5, "\\%03o", c);
used += 4;
} else {
dest[used++] = c; break;
@ -47,51 +54,111 @@ int CEscapeString(const char* src, int src_len, char* dest,
}
if (dest_len - used < 1) // make sure that there is room for \0
return -1;
return (size_t)-1;
dest[used] = '\0'; // doesn't count towards return value though
return used;
}
// ----------------------------------------------------------------------
// CEscape()
// Copies 'src' to result, escaping dangerous characters using
// C-style escape sequences. 'src' and 'dest' should not overlap.
// ----------------------------------------------------------------------
string CEscape(const StringPiece& src) {
const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
char* dest = new char[dest_length];
const int len = CEscapeString(src.data(), src.size(),
dest, dest_length);
string s = string(dest, len);
const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion
char* dest = new char[dest_len];
const size_t used = CEscapeString(src.data(), src.size(),
dest, dest_len);
string s = string(dest, used);
delete[] dest;
return s;
}
string PrefixSuccessor(const StringPiece& prefix) {
void PrefixSuccessor(string* prefix) {
// We can increment the last character in the string and be done
// unless that character is 255, in which case we have to erase the
// last character and increment the previous character, unless that
// is 255, etc. If the string is empty or consists entirely of
// 255's, we just return the empty string.
bool done = false;
string limit(prefix.data(), prefix.size());
int index = limit.length() - 1;
while (!done && index >= 0) {
if ((limit[index]&255) == 255) {
limit.erase(index);
index--;
while (!prefix->empty()) {
char& c = prefix->back();
if (c == '\xff') { // char literal avoids signed/unsigned.
prefix->pop_back();
} else {
limit[index]++;
done = true;
++c;
break;
}
}
if (!done) {
return "";
} else {
return limit;
}
}
// Appends the printf-style expansion of |format|, using the arguments in
// |ap|, to *dst.
//
// |ap| itself is never handed to vsnprintf: each formatting pass works on a
// fresh va_copy, because a va_list may be invalidated by use and more than
// one pass can be needed. The caller remains responsible for va_end(ap).
//
// If vsnprintf keeps reporting an error (negative result) the function gives
// up once the retry buffer has reached kMaxLength bytes and leaves *dst
// untouched, instead of doubling the buffer size past the range of int.
static void StringAppendV(string* dst, const char* format, va_list ap) {
  // First try with a small fixed size buffer; short output needs no heap
  // allocation at all.
  char space[1024];

  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
    // It fit
    dst->append(space, static_cast<size_t>(result));
    return;
  }

  // Largest buffer the doubling strategy is allowed to reach. Doubling an
  // int beyond this would overflow, which is undefined behaviour.
  const int kMaxLength = 1 << 30;

  // Repeatedly increase buffer size until it fits
  int length = static_cast<int>(sizeof(space));
  while (true) {
    if (result < 0) {
      // Older behavior: vsnprintf does not say how much room it needs, so
      // just try doubling the buffer size -- but only while that is safe.
      if (length > kMaxLength / 2)
        return;
      length *= 2;
    } else {
      // We need exactly "result+1" characters
      length = result + 1;
    }

    // A string owns the scratch buffer, so it is released on every exit
    // path, including an exception thrown by dst->append().
    string buf(static_cast<size_t>(length), '\0');

    // Restore the va_list before we use it again
    va_copy(backup_ap, ap);
    result = vsnprintf(&buf[0], buf.size(), format, backup_ap);
    va_end(backup_ap);

    if ((result >= 0) && (result < length)) {
      // It fit
      dst->append(buf.data(), static_cast<size_t>(result));
      return;
    }
  }
}
// Formats the variadic arguments according to |format| (printf-style) and
// returns the text as a new string.
string StringPrintf(const char* format, ...) {
  string formatted;

  va_list args;
  va_start(args, format);
  StringAppendV(&formatted, format, args);
  va_end(args);

  return formatted;
}
// Replaces the contents of *dst with the printf-style expansion of |format|:
// *dst is emptied first, then the formatted text is appended to it.
void SStringPrintf(string* dst, const char* format, ...) {
  dst->clear();

  va_list args;
  va_start(args, format);
  StringAppendV(dst, format, args);
  va_end(args);
}
// Appends the printf-style expansion of |format| to *dst, keeping whatever
// *dst already holds.
void StringAppendF(string* dst, const char* format, ...) {
  va_list args;
  va_start(args, format);
  StringAppendV(dst, format, args);
  va_end(args);
}
} // namespace re2

View File

@ -0,0 +1,23 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_STRUTIL_H_
#define UTIL_STRUTIL_H_
#include <string>
#include "re2/stringpiece.h"
#include "util/util.h"
namespace re2 {
// Returns a new string holding the output of CEscapeString (strutil.cc)
// applied to the bytes of |src|.
// NOTE(review): presumably C-style backslash escaping -- confirm against
// CEscapeString, whose body is not visible from here.
string CEscape(const StringPiece& src);
// Rewrites *prefix in place: trailing '\xff' bytes are removed, then the last
// remaining byte is incremented. If *prefix is empty or consists solely of
// '\xff' bytes, it ends up empty.
void PrefixSuccessor(string* prefix);
// Returns the printf-style expansion of |format| as a new string.
string StringPrintf(const char* format, ...);
// Clears *dst, then stores the printf-style expansion of |format| in it.
void SStringPrintf(string* dst, const char* format, ...);
// Appends the printf-style expansion of |format| to the existing *dst.
void StringAppendF(string* dst, const char* format, ...);
} // namespace re2
#endif // UTIL_STRUTIL_H_

View File

@ -3,7 +3,10 @@
// license that can be found in the LICENSE file.
#include <stdio.h>
#ifndef _WIN32
#include <sys/resource.h>
#endif
#include "util/test.h"
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
@ -21,14 +24,6 @@ void RegisterTest(void (*fn)(void), const char *name) {
tests[ntests++].name = name;
}
namespace re2 {
int64 VirtualProcessSize() {
struct rusage ru;
getrusage(RUSAGE_SELF, &ru);
return (int64)ru.ru_maxrss*1024;
}
} // namespace re2
int main(int argc, char** argv) {
for (int i = 0; i < ntests; i++) {
printf("%s\n", tests[i].name);

View File

@ -2,11 +2,12 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_TEST_H__
#define RE2_UTIL_TEST_H__
#ifndef UTIL_TEST_H_
#define UTIL_TEST_H_
#include "util/util.h"
#include "util/flags.h"
#include "util/logging.h"
#define TEST(x, y) \
void x##y(void); \
@ -31,14 +32,6 @@ class TestRegisterer {
#define EXPECT_GE CHECK_GE
#define EXPECT_FALSE(x) CHECK(!(x))
#define ARRAYSIZE arraysize
#define EXPECT_TRUE_M(x, y) CHECK(x) << (y)
#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
#define ASSERT_TRUE_M(x, y) CHECK(x) << (y)
#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y)
const bool UsingMallocCounter = false;
namespace testing {
class MallocCounter {
public:
@ -50,8 +43,4 @@ class MallocCounter {
};
} // namespace testing
namespace re2 {
int64 VirtualProcessSize();
} // namespace re2
#endif // RE2_UTIL_TEST_H__
#endif // UTIL_TEST_H_

View File

@ -1,44 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <pthread.h>
#include "util/util.h"
#include "util/thread.h"
// Builds a Thread that has not been started and is not joinable.
Thread::Thread()
    : pid_(0),
      running_(false),
      joinable_(false) {}

// Nothing to release here: the destructor neither joins nor cancels the
// underlying pthread.
Thread::~Thread() {}
void *startThread(void *v) {
Thread* t = (Thread*)v;
t->Run();
return 0;
}
// Launches the thread; the new thread executes Run() through startThread.
//
// CHECK-fails if the thread has already been started, or if pthread_create
// reports an error. A thread that was not made joinable with
// SetJoinable(true) is detached immediately after creation.
void Thread::Start() {
  CHECK(!running_);
  // pthread_create returns 0 on success and an error number otherwise. On
  // failure pid_ is not a usable handle, so we must neither mark the thread
  // as running nor pass pid_ to pthread_detach.
  const int err = pthread_create(&pid_, 0, startThread, this);
  CHECK(err == 0);
  running_ = true;
  if (!joinable_)
    pthread_detach(pid_);
}
// Blocks until the thread has finished, then marks it as no longer running.
// CHECK-fails unless the thread is both running and joinable.
void Thread::Join() {
  CHECK(running_);
  CHECK(joinable_);

  // pthread_join wants somewhere to store the exit value; it is not used.
  void* exit_value;
  pthread_join(pid_, &exit_value);

  running_ = false;
}
// Records whether the thread should be joinable (j == true) or not.
// Must be called before Start(): CHECK-fails if the thread is already
// running. Start() detaches the thread when this flag is false.
void Thread::SetJoinable(bool j) {
CHECK(!running_);
joinable_ = j;
}

View File

@ -1,26 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_THREAD_H__
#define RE2_UTIL_THREAD_H__
#include <pthread.h>
// Small wrapper around a POSIX thread. Subclasses provide the thread body by
// overriding Run(); Start() creates the thread that executes it.
class Thread {
public:
// Creates a thread object that is not running and not joinable.
Thread();
virtual ~Thread();
// Creates the underlying pthread, which then executes Run(). Must not be
// called while the thread is already running. Detaches the thread unless
// SetJoinable(true) was called beforehand.
void Start();
// Waits for the thread to finish. Requires a running, joinable thread.
void Join();
// Sets whether the thread is joinable. Must be called before Start().
void SetJoinable(bool);
// The thread body; invoked on the newly created thread.
virtual void Run() = 0;
private:
// Handle filled in by pthread_create in Start().
pthread_t pid_;
// True from Start() until Join() returns.
bool running_;
// True if Join() may be used; otherwise Start() detaches the thread.
bool joinable_;
};
#endif // RE2_UTIL_THREAD_H__

View File

@ -14,8 +14,9 @@
* This file and rune.cc have been converted to compile as C++ code
* in name space re2.
*/
#ifndef RE2_UTIL_UTF_H__
#define RE2_UTIL_UTF_H__
#ifndef UTIL_UTF_H_
#define UTIL_UTF_H_
#include <stdint.h>
@ -40,4 +41,4 @@ char* utfrune(const char*, Rune);
} // namespace re2
#endif // RE2_UTIL_UTF_H__
#endif // UTIL_UTF_H_

View File

@ -2,125 +2,21 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UTIL_UTIL_H__
#define RE2_UTIL_UTIL_H__
#ifndef UTIL_UTIL_H_
#define UTIL_UTIL_H_
// C
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stddef.h> // For size_t
#include <assert.h>
#include <stdarg.h>
#include <sys/time.h>
#include <time.h>
#include <ctype.h> // For isdigit, isalpha.
// C++
#include <vector>
// TODO(junyer): Get rid of this.
#include <string>
#include <algorithm>
#include <iosfwd>
#include <map>
#include <stack>
#include <ostream>
#include <utility>
#include <set>
// Use std names.
using std::set;
using std::pair;
using std::vector;
using std::string;
using std::min;
using std::max;
using std::ostream;
using std::map;
using std::stack;
using std::sort;
using std::swap;
using std::make_pair;
#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) && !defined(OS_ANDROID)
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))
#include <tr1/unordered_set>
using std::tr1::unordered_set;
#else
#include <unordered_set>
#if defined(WIN32) || defined(OS_ANDROID)
using std::tr1::unordered_set;
#else
using std::unordered_set;
#ifndef FALLTHROUGH_INTENDED
#define FALLTHROUGH_INTENDED do { } while (0)
#endif
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
namespace re2 {
typedef int8_t int8;
typedef uint8_t uint8;
typedef int16_t int16;
typedef uint16_t uint16;
typedef int32_t int32;
typedef uint32_t uint32;
typedef int64_t int64;
typedef uint64_t uint64;
typedef unsigned long ulong;
typedef unsigned int uint;
typedef unsigned short ushort;
// COMPILE_ASSERT causes a compile error about msg if expr is not true.
#if __cplusplus >= 201103L
#define COMPILE_ASSERT(expr, msg) static_assert(expr, #msg)
#else
template<bool> struct CompileAssert {};
#define COMPILE_ASSERT(expr, msg) \
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
#endif
// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions.
// It goes in the private: declarations in a class.
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
TypeName(const TypeName&); \
void operator=(const TypeName&)
#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
class StringPiece;
string CEscape(const StringPiece& src);
int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
extern string StringPrintf(const char* format, ...);
extern void SStringPrintf(string* dst, const char* format, ...);
extern void StringAppendF(string* dst, const char* format, ...);
extern string PrefixSuccessor(const StringPiece& prefix);
uint32 hashword(const uint32*, size_t, uint32);
void hashword2(const uint32*, size_t, uint32*, uint32*);
static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {
return hashword((uint32*)s, len/4, seed);
}
static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {
uint32 x, y;
x = seed;
y = 0;
hashword2((uint32*)s, len/4, &x, &y);
return ((uint64)x << 32) | y;
}
int RunningOnValgrind();
} // namespace re2
#include "util/arena.h"
#include "util/logging.h"
#include "util/mutex.h"
#include "util/utf.h"
#endif // RE2_UTIL_UTIL_H__
#endif // UTIL_UTIL_H_

View File

@ -1,24 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "util/util.h"
#include "util/valgrind.h"
namespace re2 {
// Compilers that lack __has_feature get a fallback that answers "no" to
// every feature query, so the #if below is always well-formed.
#ifndef __has_feature
#define __has_feature(x) 0
#endif

// Returns nonzero when built with MemorySanitizer; otherwise, if the
// RUNNING_ON_VALGRIND macro is defined, returns its value; otherwise 0.
int RunningOnValgrind() {
#if __has_feature(memory_sanitizer)
  const int under_checker = 1;
#elif defined(RUNNING_ON_VALGRIND)
  const int under_checker = RUNNING_ON_VALGRIND;
#else
  const int under_checker = 0;
#endif
  return under_checker;
}
} // namespace re2

File diff suppressed because it is too large Load Diff

View File

@ -623,7 +623,7 @@ struct ReplaceRegexpImpl
{
re2_st::StringPiece matches[max_captures];
int start_pos = 0;
size_t start_pos = 0;
while (start_pos < input.length())
{
/// If no more replacements possible for current string