mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Updated re2 to the latest version [#CLICKHOUSE-2]
This commit is contained in:
parent
9f57a1f7a5
commit
3e3d7b354a
@ -1,33 +1,30 @@
|
||||
set (re2_sources
|
||||
./re2/tostring.cc
|
||||
./re2/dfa.cc
|
||||
./re2/prefilter.cc
|
||||
./re2/compile.cc
|
||||
./re2/regexp.cc
|
||||
./re2/onepass.cc
|
||||
./re2/prefilter_tree.cc
|
||||
./re2/set.cc
|
||||
./re2/filtered_re2.cc
|
||||
./re2/perl_groups.cc
|
||||
./re2/parse.cc
|
||||
./re2/nfa.cc
|
||||
./re2/bitstate.cc
|
||||
./re2/simplify.cc
|
||||
./re2/unicode_groups.cc
|
||||
./re2/mimics_pcre.cc
|
||||
./re2/re2.cc
|
||||
./re2/prog.cc
|
||||
./re2/unicode_casefold.cc
|
||||
./util/strutil.cc
|
||||
./util/stringpiece.cc
|
||||
./util/hash.cc
|
||||
./util/arena.cc
|
||||
./util/valgrind.cc
|
||||
./util/pcre.cc
|
||||
./util/stringprintf.cc
|
||||
./util/rune.cc
|
||||
./util/random.cc
|
||||
./util/thread.cc
|
||||
./re2/bitstate.cc
|
||||
./re2/compile.cc
|
||||
./re2/dfa.cc
|
||||
./re2/filtered_re2.cc
|
||||
./re2/mimics_pcre.cc
|
||||
./re2/nfa.cc
|
||||
./re2/onepass.cc
|
||||
./re2/parse.cc
|
||||
./re2/perl_groups.cc
|
||||
./re2/prefilter.cc
|
||||
./re2/prefilter_tree.cc
|
||||
./re2/prog.cc
|
||||
./re2/re2.cc
|
||||
./re2/regexp.cc
|
||||
./re2/set.cc
|
||||
./re2/simplify.cc
|
||||
./re2/stringpiece.cc
|
||||
./re2/tostring.cc
|
||||
./re2/unicode_casefold.cc
|
||||
./re2/unicode_groups.cc
|
||||
./util/benchmark.cc
|
||||
./util/fuzz.cc
|
||||
./util/pcre.cc
|
||||
./util/rune.cc
|
||||
./util/strutil.cc
|
||||
./util/test.cc
|
||||
)
|
||||
|
||||
# Building re2 which is thread-safe and re2_st which is not.
|
||||
|
@ -1,9 +1 @@
|
||||
Source: hg clone https://re2.googlecode.com/hg re2
|
||||
|
||||
Latest commit:
|
||||
|
||||
changeset: 118:1b483548272e
|
||||
tag: tip
|
||||
user: Russ Cox <rsc@swtch.com>
|
||||
date: Mon Oct 06 15:08:47 2014 -0400
|
||||
summary: doc: import clarifications from Go tree
|
||||
https://github.com/google/re2/tree/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0
|
||||
|
@ -1,10 +0,0 @@
|
||||
#!/bin/sh
|
||||
|
||||
rm -rf re2_st
|
||||
mkdir -p re2_st
|
||||
|
||||
for i in filtered_re2.h re2.h set.h stringpiece.h variadic_function.h;
|
||||
do
|
||||
cp $1/re2/$i re2_st/$i
|
||||
sed -i -r 's/using re2::RE2;//g;s/namespace re2/namespace re2_st/g;s/re2::/re2_st::/g;s/\"re2\//\"re2_st\//g;s/(.*?_H)/\1_ST/g' re2_st/$i;
|
||||
done
|
113
contrib/libre2/re2/bitmap256.h
Normal file
113
contrib/libre2/re2/bitmap256.h
Normal file
@ -0,0 +1,113 @@
|
||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_BITMAP256_H_
|
||||
#define RE2_BITMAP256_H_
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class Bitmap256 {
|
||||
public:
|
||||
Bitmap256() {
|
||||
memset(words_, 0, sizeof words_);
|
||||
}
|
||||
|
||||
// Tests the bit with index c.
|
||||
bool Test(int c) const {
|
||||
DCHECK_GE(c, 0);
|
||||
DCHECK_LE(c, 255);
|
||||
|
||||
return (words_[c / 64] & (1ULL << (c % 64))) != 0;
|
||||
}
|
||||
|
||||
// Sets the bit with index c.
|
||||
void Set(int c) {
|
||||
DCHECK_GE(c, 0);
|
||||
DCHECK_LE(c, 255);
|
||||
|
||||
words_[c / 64] |= (1ULL << (c % 64));
|
||||
}
|
||||
|
||||
// Finds the next non-zero bit with index >= c.
|
||||
// Returns -1 if no such bit exists.
|
||||
int FindNextSetBit(int c) const;
|
||||
|
||||
private:
|
||||
// Finds the least significant non-zero bit in n.
|
||||
static int FindLSBSet(uint64_t n) {
|
||||
DCHECK_NE(n, 0);
|
||||
|
||||
#if defined(__GNUC__)
|
||||
return __builtin_ctzll(n);
|
||||
#elif defined(_MSC_VER) && defined(_M_X64)
|
||||
unsigned long c;
|
||||
_BitScanForward64(&c, n);
|
||||
return static_cast<int>(c);
|
||||
#elif defined(_MSC_VER) && defined(_M_IX86)
|
||||
unsigned long c;
|
||||
if (static_cast<uint32_t>(n) != 0) {
|
||||
_BitScanForward(&c, static_cast<uint32_t>(n));
|
||||
return static_cast<int>(c);
|
||||
} else {
|
||||
_BitScanForward(&c, static_cast<uint32_t>(n >> 32));
|
||||
return static_cast<int>(c) + 32;
|
||||
}
|
||||
#else
|
||||
int c = 63;
|
||||
for (int shift = 1 << 5; shift != 0; shift >>= 1) {
|
||||
uint64_t word = n << shift;
|
||||
if (word != 0) {
|
||||
n = word;
|
||||
c -= shift;
|
||||
}
|
||||
}
|
||||
return c;
|
||||
#endif
|
||||
}
|
||||
|
||||
uint64_t words_[4];
|
||||
};
|
||||
|
||||
int Bitmap256::FindNextSetBit(int c) const {
|
||||
DCHECK_GE(c, 0);
|
||||
DCHECK_LE(c, 255);
|
||||
|
||||
// Check the word that contains the bit. Mask out any lower bits.
|
||||
int i = c / 64;
|
||||
uint64_t word = words_[i] & (~0ULL << (c % 64));
|
||||
if (word != 0)
|
||||
return (i * 64) + FindLSBSet(word);
|
||||
|
||||
// Check any following words.
|
||||
i++;
|
||||
switch (i) {
|
||||
case 1:
|
||||
if (words_[1] != 0)
|
||||
return (1 * 64) + FindLSBSet(words_[1]);
|
||||
FALLTHROUGH_INTENDED;
|
||||
case 2:
|
||||
if (words_[2] != 0)
|
||||
return (2 * 64) + FindLSBSet(words_[2]);
|
||||
FALLTHROUGH_INTENDED;
|
||||
case 3:
|
||||
if (words_[3] != 0)
|
||||
return (3 * 64) + FindLSBSet(words_[3]);
|
||||
FALLTHROUGH_INTENDED;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_BITMAP256_H_
|
@ -17,6 +17,11 @@
|
||||
// SearchBitState is a fast replacement for the NFA code on small
|
||||
// regexps and texts when SearchOnePass cannot be used.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "util/logging.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
@ -60,8 +65,8 @@ class BitState {
|
||||
int ncap_;
|
||||
|
||||
static const int VisitedBits = 32;
|
||||
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||
int nvisited_; // # of words in bitmap
|
||||
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||
size_t nvisited_; // # of words in bitmap
|
||||
|
||||
Job *job_; // stack of text positions to explore
|
||||
int njob_;
|
||||
@ -94,7 +99,7 @@ BitState::~BitState() {
|
||||
// If so, remember that it was visited so that the next time,
|
||||
// we don't repeat the visit.
|
||||
bool BitState::ShouldVisit(int id, const char* p) {
|
||||
uint n = id * (text_.size() + 1) + (p - text_.begin());
|
||||
size_t n = id * (text_.size() + 1) + (p - text_.begin());
|
||||
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
||||
return false;
|
||||
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
||||
@ -103,7 +108,6 @@ bool BitState::ShouldVisit(int id, const char* p) {
|
||||
|
||||
// Grow the stack.
|
||||
bool BitState::GrowStack() {
|
||||
// VLOG(0) << "Reallocate.";
|
||||
maxjob_ *= 2;
|
||||
Job* newjob = new Job[maxjob_];
|
||||
memmove(newjob, job_, njob_*sizeof job_[0]);
|
||||
@ -141,6 +145,7 @@ void BitState::Push(int id, const char* p, int arg) {
|
||||
// Return whether it succeeded.
|
||||
bool BitState::TrySearch(int id0, const char* p0) {
|
||||
bool matched = false;
|
||||
bool inaltmatch = false;
|
||||
const char* end = text_.end();
|
||||
njob_ = 0;
|
||||
Push(id0, p0, 0);
|
||||
@ -159,81 +164,86 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
||||
// would have, but we avoid the stack
|
||||
// manipulation.
|
||||
if (0) {
|
||||
Next:
|
||||
// If the Match of a non-greedy AltMatch failed,
|
||||
// we stop ourselves from trying the ByteRange,
|
||||
// which would steer us off the short circuit.
|
||||
if (prog_->inst(id)->last() || inaltmatch)
|
||||
continue;
|
||||
id++;
|
||||
|
||||
CheckAndLoop:
|
||||
if (!ShouldVisit(id, p))
|
||||
continue;
|
||||
}
|
||||
|
||||
// Visit ip, p.
|
||||
// VLOG(0) << "Job: " << ip->id() << " "
|
||||
// << (p - text_.begin()) << " " << arg;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
case kInstFail:
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
||||
return false;
|
||||
|
||||
case kInstAlt:
|
||||
// Cannot just
|
||||
// Push(ip->out1(), p, 0);
|
||||
// Push(ip->out(), p, 0);
|
||||
// If, during the processing of ip->out(), we encounter
|
||||
// ip->out1() via another path, we want to process it then.
|
||||
// Pushing it here will inhibit that. Instead, re-push
|
||||
// ip with arg==1 as a reminder to push ip->out1() later.
|
||||
case kInstFail:
|
||||
continue;
|
||||
|
||||
case kInstAltMatch:
|
||||
switch (arg) {
|
||||
case 0:
|
||||
inaltmatch = true;
|
||||
Push(id, p, 1); // come back when we're done
|
||||
|
||||
// One opcode is ByteRange; the other leads to Match
|
||||
// (possibly via Nop or Capture).
|
||||
if (ip->greedy(prog_)) {
|
||||
// out1 is the match
|
||||
Push(ip->out1(), p, 0);
|
||||
id = ip->out1();
|
||||
p = end;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
// out is the match - non-greedy
|
||||
Push(ip->out(), end, 0);
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case 1:
|
||||
// Finished ip->out(); try ip->out1().
|
||||
arg = 0;
|
||||
id = ip->out1();
|
||||
goto CheckAndLoop;
|
||||
inaltmatch = false;
|
||||
continue;
|
||||
}
|
||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
||||
LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
|
||||
continue;
|
||||
|
||||
case kInstAltMatch:
|
||||
// One opcode is byte range; the other leads to match.
|
||||
if (ip->greedy(prog_)) {
|
||||
// out1 is the match
|
||||
Push(ip->out1(), p, 0);
|
||||
id = ip->out1();
|
||||
p = end;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
// out is the match - non-greedy
|
||||
Push(ip->out(), end, 0);
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstByteRange: {
|
||||
int c = -1;
|
||||
if (p < end)
|
||||
c = *p & 0xFF;
|
||||
if (ip->Matches(c)) {
|
||||
id = ip->out();
|
||||
p++;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
continue;
|
||||
if (!ip->Matches(c))
|
||||
goto Next;
|
||||
|
||||
if (!ip->last())
|
||||
Push(id+1, p, 0); // try the next when we're done
|
||||
id = ip->out();
|
||||
p++;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
switch (arg) {
|
||||
case 0:
|
||||
if (!ip->last())
|
||||
Push(id+1, p, 0); // try the next when we're done
|
||||
|
||||
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
||||
// Capture p to register, but save old value.
|
||||
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
||||
cap_[ip->cap()] = p;
|
||||
}
|
||||
|
||||
// Continue on.
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case 1:
|
||||
// Finished ip->out(); restore the old value.
|
||||
cap_[ip->cap()] = p;
|
||||
@ -244,19 +254,23 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
continue;
|
||||
goto Next;
|
||||
|
||||
if (!ip->last())
|
||||
Push(id+1, p, 0); // try the next when we're done
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstNop:
|
||||
if (!ip->last())
|
||||
Push(id+1, p, 0); // try the next when we're done
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstMatch: {
|
||||
if (endmatch_ && p != text_.end())
|
||||
continue;
|
||||
goto Next;
|
||||
|
||||
// VLOG(0) << "Found match.";
|
||||
// We found a match. If the caller doesn't care
|
||||
// where the match is, no point going further.
|
||||
if (nsubmatch_ == 0)
|
||||
@ -270,7 +284,9 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
||||
if (submatch_[0].data() == NULL ||
|
||||
(longest_ && p > submatch_[0].end())) {
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
|
||||
submatch_[i] =
|
||||
StringPiece(cap_[2 * i],
|
||||
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
|
||||
}
|
||||
|
||||
// If going for first match, we're done.
|
||||
@ -282,7 +298,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
||||
return true;
|
||||
|
||||
// Otherwise, continue on in hope of a longer match.
|
||||
continue;
|
||||
goto Next;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -308,13 +324,12 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = NULL;
|
||||
submatch_[i] = StringPiece();
|
||||
|
||||
// Allocate scratch space.
|
||||
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
||||
visited_ = new uint32[nvisited_];
|
||||
visited_ = new uint32_t[nvisited_];
|
||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
||||
// VLOG(0) << "nvisited_ = " << nvisited_;
|
||||
|
||||
ncap_ = 2*nsubmatch;
|
||||
if (ncap_ < 2)
|
||||
@ -338,6 +353,14 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
||||
// but we are not clearing visited_ between calls to TrySearch,
|
||||
// so no work is duplicated and it ends up still being linear.
|
||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
||||
// Try to use memchr to find the first byte quickly.
|
||||
int fb = prog_->first_byte();
|
||||
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
|
||||
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
|
||||
if (p == NULL)
|
||||
p = text.end();
|
||||
}
|
||||
|
||||
cap_[0] = p;
|
||||
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
|
@ -8,6 +8,13 @@
|
||||
// This file's external interface is just Regexp::CompileToProg.
|
||||
// The Compiler class defined in this file is private.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
|
||||
#include "util/logging.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
@ -28,14 +35,14 @@ namespace re2 {
|
||||
// is always the fail instruction, which never appears on a list.
|
||||
|
||||
struct PatchList {
|
||||
uint32 p;
|
||||
uint32_t p;
|
||||
|
||||
// Returns patch list containing just p.
|
||||
static PatchList Mk(uint32 p);
|
||||
static PatchList Mk(uint32_t p);
|
||||
|
||||
// Patches all the entries on l to have value v.
|
||||
// Caller must not ever use patch list again.
|
||||
static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
|
||||
static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
|
||||
|
||||
// Deref returns the next pointer pointed at by p.
|
||||
static PatchList Deref(Prog::Inst *inst0, PatchList l);
|
||||
@ -47,7 +54,7 @@ struct PatchList {
|
||||
static PatchList nullPatchList = { 0 };
|
||||
|
||||
// Returns patch list containing just p.
|
||||
PatchList PatchList::Mk(uint32 p) {
|
||||
PatchList PatchList::Mk(uint32_t p) {
|
||||
PatchList l;
|
||||
l.p = p;
|
||||
return l;
|
||||
@ -64,7 +71,7 @@ PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
|
||||
}
|
||||
|
||||
// Patches all the entries on l to have value v.
|
||||
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
|
||||
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
|
||||
while (l.p != 0) {
|
||||
Prog::Inst* ip = &inst0[l.p>>1];
|
||||
if (l.p&1) {
|
||||
@ -103,17 +110,17 @@ PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
|
||||
|
||||
// Compiled program fragment.
|
||||
struct Frag {
|
||||
uint32 begin;
|
||||
uint32_t begin;
|
||||
PatchList end;
|
||||
|
||||
Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
|
||||
Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
|
||||
Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
|
||||
};
|
||||
|
||||
// Input encodings.
|
||||
enum Encoding {
|
||||
kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
|
||||
kEncodingLatin1, // Latin1 (0-FF)
|
||||
kEncodingLatin1, // Latin-1 (0-FF)
|
||||
};
|
||||
|
||||
class Compiler : public Regexp::Walker<Frag> {
|
||||
@ -125,12 +132,11 @@ class Compiler : public Regexp::Walker<Frag> {
|
||||
// Caller is responsible for deleting Prog when finished with it.
|
||||
// If reversed is true, compiles for walking over the input
|
||||
// string backward (reverses all concatenations).
|
||||
static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
|
||||
static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
|
||||
|
||||
// Compiles alternation of all the re to a new Prog.
|
||||
// Each re has a match with an id equal to its index in the vector.
|
||||
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
Regexp* re);
|
||||
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
|
||||
|
||||
// Interface for Regexp::Walker, which helps traverse the Regexp.
|
||||
// The walk is purely post-recursive: given the machines for the
|
||||
@ -162,7 +168,7 @@ class Compiler : public Regexp::Walker<Frag> {
|
||||
Frag NoMatch();
|
||||
|
||||
// Returns a fragment that matches the empty string.
|
||||
Frag Match(int32 id);
|
||||
Frag Match(int32_t id);
|
||||
|
||||
// Returns a no-op fragment.
|
||||
Frag Nop();
|
||||
@ -178,9 +184,6 @@ class Compiler : public Regexp::Walker<Frag> {
|
||||
// Returns -1 if no more instructions are available.
|
||||
int AllocInst(int n);
|
||||
|
||||
// Deletes unused instructions.
|
||||
void Trim();
|
||||
|
||||
// Rune range compiler.
|
||||
|
||||
// Begins a new alternation.
|
||||
@ -193,19 +196,35 @@ class Compiler : public Regexp::Walker<Frag> {
|
||||
void Add_80_10ffff();
|
||||
|
||||
// New suffix that matches the byte range lo-hi, then goes to next.
|
||||
int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
|
||||
int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
|
||||
int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
|
||||
int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
|
||||
|
||||
// Returns true iff the suffix is cached.
|
||||
bool IsCachedRuneByteSuffix(int id);
|
||||
|
||||
// Adds a suffix to alternation.
|
||||
void AddSuffix(int id);
|
||||
|
||||
// Adds a suffix to the trie starting from the given root node.
|
||||
// Returns zero iff allocating an instruction fails. Otherwise, returns
|
||||
// the current root node, which might be different from what was given.
|
||||
int AddSuffixRecursive(int root, int id);
|
||||
|
||||
// Finds the trie node for the given suffix. Returns a Frag in order to
|
||||
// distinguish between pointing at the root node directly (end.p == 0)
|
||||
// and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
|
||||
Frag FindByteRange(int root, int id);
|
||||
|
||||
// Compares two ByteRanges and returns true iff they are equal.
|
||||
bool ByteRangeEqual(int id1, int id2);
|
||||
|
||||
// Returns the alternation of all the added suffixes.
|
||||
Frag EndRange();
|
||||
|
||||
// Single rune.
|
||||
Frag Literal(Rune r, bool foldcase);
|
||||
|
||||
void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
|
||||
void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
|
||||
Prog* Finish();
|
||||
|
||||
// Returns .* where dot = any byte
|
||||
@ -223,14 +242,15 @@ class Compiler : public Regexp::Walker<Frag> {
|
||||
int inst_len_; // Number of instructions used.
|
||||
int inst_cap_; // Number of instructions allocated.
|
||||
|
||||
int64 max_mem_; // Total memory budget.
|
||||
int64_t max_mem_; // Total memory budget.
|
||||
|
||||
map<uint64, int> rune_cache_;
|
||||
std::unordered_map<uint64_t, int> rune_cache_;
|
||||
Frag rune_range_;
|
||||
|
||||
RE2::Anchor anchor_; // anchor mode for RE2::Set
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Compiler);
|
||||
Compiler(const Compiler&) = delete;
|
||||
Compiler& operator=(const Compiler&) = delete;
|
||||
};
|
||||
|
||||
Compiler::Compiler() {
|
||||
@ -265,7 +285,8 @@ int Compiler::AllocInst(int n) {
|
||||
while (inst_len_ + n > inst_cap_)
|
||||
inst_cap_ *= 2;
|
||||
Prog::Inst* ip = new Prog::Inst[inst_cap_];
|
||||
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
|
||||
if (inst_ != NULL)
|
||||
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
|
||||
memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
|
||||
delete[] inst_;
|
||||
inst_ = ip;
|
||||
@ -275,16 +296,6 @@ int Compiler::AllocInst(int n) {
|
||||
return id;
|
||||
}
|
||||
|
||||
void Compiler::Trim() {
|
||||
if (inst_len_ < inst_cap_) {
|
||||
Prog::Inst* ip = new Prog::Inst[inst_len_];
|
||||
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
|
||||
delete[] inst_;
|
||||
inst_ = ip;
|
||||
inst_cap_ = inst_len_;
|
||||
}
|
||||
}
|
||||
|
||||
// These routines are somewhat hard to visualize in text --
|
||||
// see http://swtch.com/~rsc/regexp/regexp1.html for
|
||||
// pictures explaining what is going on here.
|
||||
@ -393,16 +404,6 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
|
||||
if (id < 0)
|
||||
return NoMatch();
|
||||
inst_[id].InitByteRange(lo, hi, foldcase, 0);
|
||||
prog_->byte_inst_count_++;
|
||||
prog_->MarkByteRange(lo, hi);
|
||||
if (foldcase && lo <= 'z' && hi >= 'a') {
|
||||
if (lo < 'a')
|
||||
lo = 'a';
|
||||
if (hi > 'z')
|
||||
hi = 'z';
|
||||
if (lo <= hi)
|
||||
prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
|
||||
}
|
||||
return Frag(id, PatchList::Mk(id << 1));
|
||||
}
|
||||
|
||||
@ -416,7 +417,7 @@ Frag Compiler::Nop() {
|
||||
}
|
||||
|
||||
// Returns a fragment that signals a match.
|
||||
Frag Compiler::Match(int32 match_id) {
|
||||
Frag Compiler::Match(int32_t match_id) {
|
||||
int id = AllocInst(1);
|
||||
if (id < 0)
|
||||
return NoMatch();
|
||||
@ -430,16 +431,6 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
|
||||
if (id < 0)
|
||||
return NoMatch();
|
||||
inst_[id].InitEmptyWidth(empty, 0);
|
||||
if (empty & (kEmptyBeginLine|kEmptyEndLine))
|
||||
prog_->MarkByteRange('\n', '\n');
|
||||
if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
|
||||
int j;
|
||||
for (int i = 0; i < 256; i = j) {
|
||||
for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
|
||||
;
|
||||
prog_->MarkByteRange(i, j-1);
|
||||
}
|
||||
}
|
||||
return Frag(id, PatchList::Mk(id << 1));
|
||||
}
|
||||
|
||||
@ -482,7 +473,7 @@ void Compiler::BeginRange() {
|
||||
rune_range_.end = nullPatchList;
|
||||
}
|
||||
|
||||
int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
|
||||
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
|
||||
int next) {
|
||||
Frag f = ByteRange(lo, hi, foldcase);
|
||||
if (next != 0) {
|
||||
@ -493,18 +484,18 @@ int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
|
||||
return f.begin;
|
||||
}
|
||||
|
||||
int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
|
||||
// In Latin1 mode, there's no point in caching.
|
||||
// In forward UTF-8 mode, only need to cache continuation bytes.
|
||||
if (encoding_ == kEncodingLatin1 ||
|
||||
(encoding_ == kEncodingUTF8 &&
|
||||
!reversed_ &&
|
||||
!(0x80 <= lo && hi <= 0xbf))) {
|
||||
return UncachedRuneByteSuffix(lo, hi, foldcase, next);
|
||||
}
|
||||
// Packs the fields that identify a byte-range suffix into a single 64-bit
// key. Layout, from the least significant bit upward:
//   bit  0      foldcase
//   bits 1-8    hi
//   bits 9-16   lo
//   bits 17+    next
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
                                 int next) {
  // Widen each field before shifting so no bits are lost to int arithmetic.
  return (static_cast<uint64_t>(next) << 17) |
         (static_cast<uint64_t>(lo) << 9) |
         (static_cast<uint64_t>(hi) << 1) |
         static_cast<uint64_t>(foldcase);
}
|
||||
|
||||
uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
|
||||
map<uint64, int>::iterator it = rune_cache_.find(key);
|
||||
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
|
||||
int next) {
|
||||
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
|
||||
std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
|
||||
if (it != rune_cache_.end())
|
||||
return it->second;
|
||||
int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
|
||||
@ -512,12 +503,31 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
|
||||
return id;
|
||||
}
|
||||
|
||||
bool Compiler::IsCachedRuneByteSuffix(int id) {
|
||||
uint8_t lo = inst_[id].lo_;
|
||||
uint8_t hi = inst_[id].hi_;
|
||||
bool foldcase = inst_[id].foldcase() != 0;
|
||||
int next = inst_[id].out();
|
||||
|
||||
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
|
||||
return rune_cache_.find(key) != rune_cache_.end();
|
||||
}
|
||||
|
||||
void Compiler::AddSuffix(int id) {
|
||||
if (failed_)
|
||||
return;
|
||||
|
||||
if (rune_range_.begin == 0) {
|
||||
rune_range_.begin = id;
|
||||
return;
|
||||
}
|
||||
|
||||
if (encoding_ == kEncodingUTF8) {
|
||||
// Build a trie in order to reduce fanout.
|
||||
rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
|
||||
return;
|
||||
}
|
||||
|
||||
int alt = AllocInst(1);
|
||||
if (alt < 0) {
|
||||
rune_range_.begin = 0;
|
||||
@ -527,6 +537,102 @@ void Compiler::AddSuffix(int id) {
|
||||
rune_range_.begin = alt;
|
||||
}
|
||||
|
||||
int Compiler::AddSuffixRecursive(int root, int id) {
|
||||
DCHECK(inst_[root].opcode() == kInstAlt ||
|
||||
inst_[root].opcode() == kInstByteRange);
|
||||
|
||||
Frag f = FindByteRange(root, id);
|
||||
if (IsNoMatch(f)) {
|
||||
int alt = AllocInst(1);
|
||||
if (alt < 0)
|
||||
return 0;
|
||||
inst_[alt].InitAlt(root, id);
|
||||
return alt;
|
||||
}
|
||||
|
||||
int br;
|
||||
if (f.end.p == 0)
|
||||
br = root;
|
||||
else if (f.end.p&1)
|
||||
br = inst_[f.begin].out1();
|
||||
else
|
||||
br = inst_[f.begin].out();
|
||||
|
||||
if (IsCachedRuneByteSuffix(br)) {
|
||||
// We can't fiddle with cached suffixes, so make a clone of the head.
|
||||
int byterange = AllocInst(1);
|
||||
if (byterange < 0)
|
||||
return 0;
|
||||
inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
|
||||
inst_[br].foldcase(), inst_[br].out());
|
||||
|
||||
// Ensure that the parent points to the clone, not to the original.
|
||||
// Note that this could leave the head unreachable except via the cache.
|
||||
br = byterange;
|
||||
if (f.end.p == 0)
|
||||
root = br;
|
||||
else if (f.end.p&1)
|
||||
inst_[f.begin].out1_ = br;
|
||||
else
|
||||
inst_[f.begin].set_out(br);
|
||||
}
|
||||
|
||||
int out = inst_[id].out();
|
||||
if (!IsCachedRuneByteSuffix(id)) {
|
||||
// The head should be the instruction most recently allocated, so free it
|
||||
// instead of leaving it unreachable.
|
||||
DCHECK_EQ(id, inst_len_-1);
|
||||
inst_[id].out_opcode_ = 0;
|
||||
inst_[id].out1_ = 0;
|
||||
inst_len_--;
|
||||
}
|
||||
|
||||
out = AddSuffixRecursive(inst_[br].out(), out);
|
||||
if (out == 0)
|
||||
return 0;
|
||||
|
||||
inst_[br].set_out(out);
|
||||
return root;
|
||||
}
|
||||
|
||||
bool Compiler::ByteRangeEqual(int id1, int id2) {
|
||||
return inst_[id1].lo() == inst_[id2].lo() &&
|
||||
inst_[id1].hi() == inst_[id2].hi() &&
|
||||
inst_[id1].foldcase() == inst_[id2].foldcase();
|
||||
}
|
||||
|
||||
// Searches the trie rooted at root for a ByteRange equal to instruction id.
// On success the returned Frag tells the caller where the match sits:
//   end.p == 0       - the root itself is the match;
//   end.p & 1 == 1   - the match is out1 of the Alt at begin;
//   end.p & 1 == 0   - the match is out of the Alt at begin.
// Returns NoMatch() when no equal ByteRange is found.
Frag Compiler::FindByteRange(int root, int id) {
  // A bare ByteRange at the root either is the match or there is none.
  if (inst_[root].opcode() == kInstByteRange)
    return ByteRangeEqual(root, id) ? Frag(root, nullPatchList) : NoMatch();

  int node = root;
  while (inst_[node].opcode() == kInstAlt) {
    if (ByteRangeEqual(inst_[node].out1(), id))
      return Frag(node, PatchList::Mk((node << 1) | 1));

    // CharClass ranges arrive sorted, so in forward mode a miss on the root
    // Alt's out1 ends the search. Reverse mode cannot take that shortcut
    // and has to keep walking the chain of Alts.
    if (!reversed_)
      return NoMatch();

    const int next = inst_[node].out();
    if (inst_[next].opcode() == kInstAlt) {
      node = next;
      continue;
    }
    if (ByteRangeEqual(next, id))
      return Frag(node, PatchList::Mk(node << 1));
    return NoMatch();
  }

  LOG(DFATAL) << "should never happen";
  return NoMatch();
}
|
||||
|
||||
// Hands back rune_range_, the fragment accumulated for the current
// rune range.
Frag Compiler::EndRange() {
  return rune_range_;
}
|
||||
@ -550,12 +656,13 @@ void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
|
||||
}
|
||||
|
||||
void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
|
||||
// Latin1 is easy: runes *are* bytes.
|
||||
// Latin-1 is easy: runes *are* bytes.
|
||||
if (lo > hi || lo > 0xFF)
|
||||
return;
|
||||
if (hi > 0xFF)
|
||||
hi = 0xFF;
|
||||
AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
|
||||
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
|
||||
static_cast<uint8_t>(hi), foldcase, 0));
|
||||
}
|
||||
|
||||
// Table describing how to make a UTF-8 matching machine
|
||||
@ -591,12 +698,13 @@ static struct ByteRangeProg {
|
||||
|
||||
void Compiler::Add_80_10ffff() {
|
||||
int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
|
||||
for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
|
||||
for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
|
||||
const ByteRangeProg& p = prog_80_10ffff[i];
|
||||
int next = 0;
|
||||
if (p.next >= 0)
|
||||
next = inst[p.next];
|
||||
inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
|
||||
inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
|
||||
static_cast<uint8_t>(p.hi), false, next);
|
||||
if ((p.lo & 0xC0) != 0x80)
|
||||
AddSuffix(inst[i]);
|
||||
}
|
||||
@ -625,13 +733,14 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
|
||||
|
||||
// ASCII range is always a special case.
|
||||
if (hi < Runeself) {
|
||||
AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
|
||||
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
|
||||
static_cast<uint8_t>(hi), foldcase, 0));
|
||||
return;
|
||||
}
|
||||
|
||||
// Split range into sections that agree on leading bytes.
|
||||
for (int i = 1; i < UTFmax; i++) {
|
||||
uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
|
||||
uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
|
||||
if ((lo & ~m) != (hi & ~m)) {
|
||||
if ((lo & m) != 0) {
|
||||
AddRuneRangeUTF8(lo, lo|m, foldcase);
|
||||
@ -647,19 +756,55 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
|
||||
}
|
||||
|
||||
// Finally. Generate byte matching equivalent for lo-hi.
|
||||
uint8 ulo[UTFmax], uhi[UTFmax];
|
||||
uint8_t ulo[UTFmax], uhi[UTFmax];
|
||||
int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
|
||||
int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
|
||||
(void)m; // USED(m)
|
||||
DCHECK_EQ(n, m);
|
||||
|
||||
// The logic below encodes this thinking:
|
||||
//
|
||||
// 1. When we have built the whole suffix, we know that it cannot
|
||||
// possibly be a suffix of anything longer: in forward mode, nothing
|
||||
// else can occur before the leading byte; in reverse mode, nothing
|
||||
// else can occur after the last continuation byte or else the leading
|
||||
// byte would have to change. Thus, there is no benefit to caching
|
||||
// the first byte of the suffix whereas there is a cost involved in
|
||||
// cloning it if it begins a common prefix, which is fairly likely.
|
||||
//
|
||||
// 2. Conversely, the last byte of the suffix cannot possibly be a
|
||||
// prefix of anything because next == 0, so we will never want to
|
||||
// clone it, but it is fairly likely to be a common suffix. Perhaps
|
||||
// more so in reverse mode than in forward mode because the former is
|
||||
// "converging" towards lower entropy, but caching is still worthwhile
|
||||
// for the latter in cases such as 80-BF.
|
||||
//
|
||||
// 3. Handling the bytes between the first and the last is less
|
||||
// straightforward and, again, the approach depends on whether we are
|
||||
// "converging" towards lower entropy: in forward mode, a single byte
|
||||
// is unlikely to be part of a common suffix whereas a byte range
|
||||
// is more likely so; in reverse mode, a byte range is unlikely to
|
||||
// be part of a common suffix whereas a single byte is more likely
|
||||
// so. The same benefit versus cost argument applies here.
|
||||
int id = 0;
|
||||
if (reversed_) {
|
||||
for (int i = 0; i < n; i++)
|
||||
id = RuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||
for (int i = 0; i < n; i++) {
|
||||
// In reverse UTF-8 mode: cache the leading byte; don't cache the last
|
||||
// continuation byte; cache anything else iff it's a single byte (XX-XX).
|
||||
if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
|
||||
id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||
else
|
||||
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||
}
|
||||
} else {
|
||||
for (int i = n-1; i >= 0; i--)
|
||||
id = RuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||
for (int i = n-1; i >= 0; i--) {
|
||||
// In forward UTF-8 mode: don't cache the leading byte; cache the last
|
||||
// continuation byte; cache anything else iff it's a byte range (XX-YY).
|
||||
if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
|
||||
id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||
else
|
||||
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||
}
|
||||
}
|
||||
AddSuffix(id);
|
||||
}
|
||||
@ -699,11 +844,11 @@ Frag Compiler::Literal(Rune r, bool foldcase) {
|
||||
case kEncodingUTF8: {
|
||||
if (r < Runeself) // Make common case fast.
|
||||
return ByteRange(r, r, foldcase);
|
||||
uint8 buf[UTFmax];
|
||||
uint8_t buf[UTFmax];
|
||||
int n = runetochar(reinterpret_cast<char*>(buf), &r);
|
||||
Frag f = ByteRange((uint8)buf[0], buf[0], false);
|
||||
Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
|
||||
for (int i = 1; i < n; i++)
|
||||
f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
|
||||
f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
|
||||
return f;
|
||||
}
|
||||
}
|
||||
@ -732,9 +877,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
||||
|
||||
case kRegexpHaveMatch: {
|
||||
Frag f = Match(re->match_id());
|
||||
// Remember unanchored match to end of string.
|
||||
if (anchor_ != RE2::ANCHOR_BOTH)
|
||||
f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
|
||||
if (anchor_ == RE2::ANCHOR_BOTH) {
|
||||
// Append \z or else the subexpression will effectively be unanchored.
|
||||
// Complemented by the UNANCHORED case in CompileSet().
|
||||
f = Cat(EmptyWidth(kEmptyEndText), f);
|
||||
}
|
||||
return f;
|
||||
}
|
||||
|
||||
@ -753,16 +900,16 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
||||
}
|
||||
|
||||
case kRegexpStar:
|
||||
return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
|
||||
return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
|
||||
|
||||
case kRegexpPlus:
|
||||
return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
|
||||
return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
|
||||
|
||||
case kRegexpQuest:
|
||||
return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
|
||||
return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
|
||||
|
||||
case kRegexpLiteral:
|
||||
return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
|
||||
return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
|
||||
|
||||
case kRegexpLiteralString: {
|
||||
// Concatenation of literals.
|
||||
@ -770,7 +917,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
||||
return Nop();
|
||||
Frag f;
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
|
||||
Frag f1 = Literal(re->runes()[i],
|
||||
(re->parse_flags()&Regexp::FoldCase) != 0);
|
||||
if (i == 0)
|
||||
f = f1;
|
||||
else
|
||||
@ -815,7 +963,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
||||
// If this range contains all of A-Za-z or none of it,
|
||||
// the fold flag is unnecessary; don't bother.
|
||||
bool fold = foldascii;
|
||||
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
|
||||
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
|
||||
('Z' < i->lo && i->hi < 'a'))
|
||||
fold = false;
|
||||
|
||||
AddRuneRange(i->lo, i->hi, fold);
|
||||
@ -949,7 +1098,7 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
|
||||
return false;
|
||||
}
|
||||
|
||||
void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
||||
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
|
||||
RE2::Anchor anchor) {
|
||||
prog_->set_flags(flags);
|
||||
|
||||
@ -958,11 +1107,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
||||
max_mem_ = max_mem;
|
||||
if (max_mem <= 0) {
|
||||
max_inst_ = 100000; // more than enough
|
||||
} else if (max_mem <= static_cast<int64>(sizeof(Prog))) {
|
||||
} else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
|
||||
// No room for anything.
|
||||
max_inst_ = 0;
|
||||
} else {
|
||||
int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
|
||||
int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
|
||||
// Limit instruction count so that inst->id() fits nicely in an int.
|
||||
// SparseArray also assumes that the indices (inst->id()) are ints.
|
||||
// The call to WalkExponential uses 2*max_inst_ below,
|
||||
@ -978,7 +1127,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
||||
if (m > Prog::Inst::kMaxInst)
|
||||
m = Prog::Inst::kMaxInst;
|
||||
|
||||
max_inst_ = m;
|
||||
max_inst_ = static_cast<int>(m);
|
||||
}
|
||||
|
||||
anchor_ = anchor;
|
||||
@ -989,10 +1138,9 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
||||
// If reversed is true, compiles a program that expects
|
||||
// to run over the input string backward (reverses all concatenations).
|
||||
// The reversed flag is also recorded in the returned program.
|
||||
Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
||||
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
|
||||
Compiler c;
|
||||
|
||||
c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
|
||||
c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
|
||||
c.reversed_ = reversed;
|
||||
|
||||
// Simplify to remove things like counted repetitions
|
||||
@ -1007,7 +1155,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
||||
bool is_anchor_end = IsAnchorEnd(&sre, 0);
|
||||
|
||||
// Generate fragment for entire regexp.
|
||||
Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
|
||||
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
|
||||
sre->Decref();
|
||||
if (c.failed_)
|
||||
return NULL;
|
||||
@ -1016,10 +1164,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
||||
// Turn off c.reversed_ (if it is set) to force the remaining concatenations
|
||||
// to behave normally.
|
||||
c.reversed_ = false;
|
||||
Frag all = c.Cat(f, c.Match(0));
|
||||
c.prog_->set_start(all.begin);
|
||||
all = c.Cat(all, c.Match(0));
|
||||
|
||||
if (reversed) {
|
||||
c.prog_->set_reversed(reversed);
|
||||
if (c.prog_->reversed()) {
|
||||
c.prog_->set_anchor_start(is_anchor_end);
|
||||
c.prog_->set_anchor_end(is_anchor_start);
|
||||
} else {
|
||||
@ -1027,15 +1175,12 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
||||
c.prog_->set_anchor_end(is_anchor_end);
|
||||
}
|
||||
|
||||
// Also create unanchored version, which starts with a .*? loop.
|
||||
if (c.prog_->anchor_start()) {
|
||||
c.prog_->set_start_unanchored(c.prog_->start());
|
||||
} else {
|
||||
Frag unanchored = c.Cat(c.DotStar(), all);
|
||||
c.prog_->set_start_unanchored(unanchored.begin);
|
||||
c.prog_->set_start(all.begin);
|
||||
if (!c.prog_->anchor_start()) {
|
||||
// Also create unanchored version, which starts with a .*? loop.
|
||||
all = c.Cat(c.DotStar(), all);
|
||||
}
|
||||
|
||||
c.prog_->set_reversed(reversed);
|
||||
c.prog_->set_start_unanchored(all.begin);
|
||||
|
||||
// Hand ownership of prog_ to caller.
|
||||
return c.Finish();
|
||||
@ -1050,22 +1195,20 @@ Prog* Compiler::Finish() {
|
||||
inst_len_ = 1;
|
||||
}
|
||||
|
||||
// Trim instruction to minimum array and transfer to Prog.
|
||||
Trim();
|
||||
// Hand off the array to Prog.
|
||||
prog_->inst_ = inst_;
|
||||
prog_->size_ = inst_len_;
|
||||
inst_ = NULL;
|
||||
|
||||
// Compute byte map.
|
||||
prog_->ComputeByteMap();
|
||||
|
||||
prog_->Optimize();
|
||||
prog_->Flatten();
|
||||
prog_->ComputeByteMap();
|
||||
|
||||
// Record remaining memory for DFA.
|
||||
if (max_mem_ <= 0) {
|
||||
prog_->set_dfa_mem(1<<20);
|
||||
} else {
|
||||
int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
|
||||
int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst);
|
||||
if (m < 0)
|
||||
m = 0;
|
||||
prog_->set_dfa_mem(m);
|
||||
@ -1077,11 +1220,11 @@ Prog* Compiler::Finish() {
|
||||
}
|
||||
|
||||
// Converts Regexp to Prog.
|
||||
Prog* Regexp::CompileToProg(int64 max_mem) {
|
||||
Prog* Regexp::CompileToProg(int64_t max_mem) {
|
||||
return Compiler::Compile(this, false, max_mem);
|
||||
}
|
||||
|
||||
Prog* Regexp::CompileToReverseProg(int64 max_mem) {
|
||||
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
|
||||
return Compiler::Compile(this, true, max_mem);
|
||||
}
|
||||
|
||||
@ -1090,41 +1233,41 @@ Frag Compiler::DotStar() {
|
||||
}
|
||||
|
||||
// Compiles RE set to Prog.
|
||||
Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
Regexp* re) {
|
||||
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
|
||||
Compiler c;
|
||||
c.Setup(re->parse_flags(), max_mem, anchor);
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
|
||||
c.Setup(pf, options.max_mem(), anchor);
|
||||
Regexp* sre = re->Simplify();
|
||||
if (sre == NULL)
|
||||
return NULL;
|
||||
|
||||
// Compile alternation of fragments.
|
||||
Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_);
|
||||
re->Decref();
|
||||
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
|
||||
sre->Decref();
|
||||
if (c.failed_)
|
||||
return NULL;
|
||||
|
||||
if (anchor == RE2::UNANCHORED) {
|
||||
// The trailing .* was added while handling kRegexpHaveMatch.
|
||||
// We just have to add the leading one.
|
||||
all = c.Cat(c.DotStar(), all);
|
||||
}
|
||||
|
||||
c.prog_->set_start(all.begin);
|
||||
c.prog_->set_start_unanchored(all.begin);
|
||||
c.prog_->set_anchor_start(true);
|
||||
c.prog_->set_anchor_end(true);
|
||||
|
||||
if (anchor == RE2::UNANCHORED) {
|
||||
// Prepend .* or else the expression will effectively be anchored.
|
||||
// Complemented by the ANCHOR_BOTH case in PostVisit().
|
||||
all = c.Cat(c.DotStar(), all);
|
||||
}
|
||||
c.prog_->set_start(all.begin);
|
||||
c.prog_->set_start_unanchored(all.begin);
|
||||
|
||||
Prog* prog = c.Finish();
|
||||
if (prog == NULL)
|
||||
return NULL;
|
||||
|
||||
// Make sure DFA has enough memory to operate,
|
||||
// since we're not going to fall back to the NFA.
|
||||
bool failed;
|
||||
bool dfa_failed = false;
|
||||
StringPiece sp = "hello, world";
|
||||
prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
|
||||
NULL, &failed, NULL);
|
||||
if (failed) {
|
||||
NULL, &dfa_failed, NULL);
|
||||
if (dfa_failed) {
|
||||
delete prog;
|
||||
return NULL;
|
||||
}
|
||||
@ -1132,9 +1275,8 @@ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
return prog;
|
||||
}
|
||||
|
||||
Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
Regexp* re) {
|
||||
return Compiler::CompileSet(options, anchor, re);
|
||||
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
|
||||
return Compiler::CompileSet(re, anchor, max_mem);
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,9 +2,13 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <string>
|
||||
#include "util/util.h"
|
||||
#include "re2/filtered_re2.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
|
||||
@ -15,6 +19,11 @@ FilteredRE2::FilteredRE2()
|
||||
prefilter_tree_(new PrefilterTree()) {
|
||||
}
|
||||
|
||||
FilteredRE2::FilteredRE2(int min_atom_len)
|
||||
: compiled_(false),
|
||||
prefilter_tree_(new PrefilterTree(min_atom_len)) {
|
||||
}
|
||||
|
||||
FilteredRE2::~FilteredRE2() {
|
||||
for (size_t i = 0; i < re2_vec_.size(); i++)
|
||||
delete re2_vec_[i];
|
||||
@ -33,16 +42,21 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
|
||||
}
|
||||
delete re;
|
||||
} else {
|
||||
*id = re2_vec_.size();
|
||||
*id = static_cast<int>(re2_vec_.size());
|
||||
re2_vec_.push_back(re);
|
||||
}
|
||||
|
||||
return code;
|
||||
}
|
||||
|
||||
void FilteredRE2::Compile(vector<string>* atoms) {
|
||||
if (compiled_ || re2_vec_.size() == 0) {
|
||||
LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
|
||||
void FilteredRE2::Compile(std::vector<string>* atoms) {
|
||||
if (compiled_) {
|
||||
LOG(ERROR) << "Compile called already.";
|
||||
return;
|
||||
}
|
||||
|
||||
if (re2_vec_.empty()) {
|
||||
LOG(ERROR) << "Compile called before Add.";
|
||||
return;
|
||||
}
|
||||
|
||||
@ -58,17 +72,17 @@ void FilteredRE2::Compile(vector<string>* atoms) {
|
||||
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
||||
for (size_t i = 0; i < re2_vec_.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
||||
return i;
|
||||
return static_cast<int>(i);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int FilteredRE2::FirstMatch(const StringPiece& text,
|
||||
const vector<int>& atoms) const {
|
||||
const std::vector<int>& atoms) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "FirstMatch called before Compile";
|
||||
LOG(DFATAL) << "FirstMatch called before Compile.";
|
||||
return -1;
|
||||
}
|
||||
vector<int> regexps;
|
||||
std::vector<int> regexps;
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
|
||||
for (size_t i = 0; i < regexps.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||
@ -78,10 +92,10 @@ int FilteredRE2::FirstMatch(const StringPiece& text,
|
||||
|
||||
bool FilteredRE2::AllMatches(
|
||||
const StringPiece& text,
|
||||
const vector<int>& atoms,
|
||||
vector<int>* matching_regexps) const {
|
||||
const std::vector<int>& atoms,
|
||||
std::vector<int>* matching_regexps) const {
|
||||
matching_regexps->clear();
|
||||
vector<int> regexps;
|
||||
std::vector<int> regexps;
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
|
||||
for (size_t i = 0; i < regexps.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||
@ -89,11 +103,16 @@ bool FilteredRE2::AllMatches(
|
||||
return !matching_regexps->empty();
|
||||
}
|
||||
|
||||
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* passed_regexps) {
|
||||
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
|
||||
void FilteredRE2::AllPotentials(
|
||||
const std::vector<int>& atoms,
|
||||
std::vector<int>* potential_regexps) const {
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
|
||||
}
|
||||
|
||||
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* passed_regexps) {
|
||||
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
|
||||
}
|
||||
|
||||
void FilteredRE2::PrintPrefilter(int regexpid) {
|
||||
prefilter_tree_->PrintPrefilter(regexpid);
|
||||
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_FILTERED_RE2_H_
|
||||
#define RE2_FILTERED_RE2_H_
|
||||
|
||||
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
||||
// It provides a prefilter mechanism that helps in cutting down the
|
||||
// number of regexps that need to be actually searched.
|
||||
@ -18,20 +21,19 @@
|
||||
// indices of strings that were found in the text to get the actual
|
||||
// regexp matches.
|
||||
|
||||
#ifndef RE2_FILTERED_RE2_H_
|
||||
#define RE2_FILTERED_RE2_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::vector;
|
||||
|
||||
class PrefilterTree;
|
||||
|
||||
class FilteredRE2 {
|
||||
public:
|
||||
FilteredRE2();
|
||||
explicit FilteredRE2(int min_atom_len);
|
||||
~FilteredRE2();
|
||||
|
||||
// Uses RE2 constructor to create a RE2 object (re). Returns
|
||||
@ -47,7 +49,7 @@ class FilteredRE2 {
|
||||
// the search text should be lowercased first to find matching
|
||||
// strings from the set of strings returned by Compile. Call after
|
||||
// all Add calls are done.
|
||||
void Compile(vector<string>* strings_to_match);
|
||||
void Compile(std::vector<string>* strings_to_match);
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Can be called prior to Compile.
|
||||
@ -59,16 +61,24 @@ class FilteredRE2 {
|
||||
// Returns -1 on no match. Compile has to be called before
|
||||
// calling this.
|
||||
int FirstMatch(const StringPiece& text,
|
||||
const vector<int>& atoms) const;
|
||||
const std::vector<int>& atoms) const;
|
||||
|
||||
// Returns the indices of all matching regexps, after first clearing
|
||||
// matched_regexps.
|
||||
bool AllMatches(const StringPiece& text,
|
||||
const vector<int>& atoms,
|
||||
vector<int>* matching_regexps) const;
|
||||
const std::vector<int>& atoms,
|
||||
std::vector<int>* matching_regexps) const;
|
||||
|
||||
// Returns the indices of all potentially matching regexps after first
|
||||
// clearing potential_regexps.
|
||||
// A regexp is potentially matching if it passes the filter.
|
||||
// If a regexp passes the filter it may still not match.
|
||||
// A regexp that does not pass the filter is guaranteed to not match.
|
||||
void AllPotentials(const std::vector<int>& atoms,
|
||||
std::vector<int>* potential_regexps) const;
|
||||
|
||||
// The number of regexps added.
|
||||
int NumRegexps() const { return re2_vec_.size(); }
|
||||
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
|
||||
|
||||
private:
|
||||
|
||||
@ -79,11 +89,11 @@ class FilteredRE2 {
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
// Useful for testing and debugging.
|
||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* passed_regexps);
|
||||
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* passed_regexps);
|
||||
|
||||
// All the regexps in the FilteredRE2.
|
||||
vector<RE2*> re2_vec_;
|
||||
std::vector<RE2*> re2_vec_;
|
||||
|
||||
// Has the FilteredRE2 been compiled using Compile()
|
||||
bool compiled_;
|
||||
@ -91,9 +101,8 @@ class FilteredRE2 {
|
||||
// An AND-OR tree of string atoms used for filtering regexps.
|
||||
PrefilterTree* prefilter_tree_;
|
||||
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
|
||||
FilteredRE2(const FilteredRE2&);
|
||||
void operator=(const FilteredRE2&);
|
||||
FilteredRE2(const FilteredRE2&) = delete;
|
||||
FilteredRE2& operator=(const FilteredRE2&) = delete;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
@ -23,6 +23,7 @@
|
||||
// Regexp::MimicsPCRE checks for any of these conditions.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
@ -124,7 +125,8 @@ class EmptyStringWalker : public Regexp::Walker<bool> {
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
|
||||
EmptyStringWalker(const EmptyStringWalker&) = delete;
|
||||
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
|
||||
};
|
||||
|
||||
// Called after visiting re's children. child_args contains the return
|
||||
|
@ -24,13 +24,24 @@
|
||||
// Like Thompson's original machine and like the DFA implementation, this
|
||||
// implementation notices a match only once it is one byte past it.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/sparse_array.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "util/strutil.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
class NFA {
|
||||
public:
|
||||
NFA(Prog* prog);
|
||||
@ -51,12 +62,10 @@ class NFA {
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
static const int Debug = 0;
|
||||
|
||||
private:
|
||||
struct Thread {
|
||||
union {
|
||||
int id;
|
||||
int ref;
|
||||
Thread* next; // when on free list
|
||||
};
|
||||
const char** capture;
|
||||
@ -64,16 +73,15 @@ class NFA {
|
||||
|
||||
// State for explicit stack in AddToThreadq.
|
||||
struct AddState {
|
||||
int id; // Inst to process
|
||||
int j;
|
||||
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
|
||||
int id; // Inst to process
|
||||
Thread* t; // if not null, set t0 = t before processing id
|
||||
|
||||
AddState()
|
||||
: id(0), j(-1), cap_j(NULL) {}
|
||||
: id(0), t(NULL) {}
|
||||
explicit AddState(int id)
|
||||
: id(id), j(-1), cap_j(NULL) {}
|
||||
AddState(int id, const char* cap_j, int j)
|
||||
: id(id), j(j), cap_j(cap_j) {}
|
||||
: id(id), t(NULL) {}
|
||||
AddState(int id, Thread* t)
|
||||
: id(id), t(t) {}
|
||||
};
|
||||
|
||||
// Threadq is a list of threads. The list is sorted by the order
|
||||
@ -82,19 +90,24 @@ class NFA {
|
||||
typedef SparseArray<Thread*> Threadq;
|
||||
|
||||
inline Thread* AllocThread();
|
||||
inline void FreeThread(Thread*);
|
||||
inline Thread* Incref(Thread* t);
|
||||
inline void Decref(Thread* t);
|
||||
|
||||
// Add id (or its children, following unlabeled arrows)
|
||||
// to the workqueue q with associated capture info.
|
||||
void AddToThreadq(Threadq* q, int id, int flag,
|
||||
const char* p, const char** capture);
|
||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||
// Enqueues only the ByteRange instructions that match byte c.
|
||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
||||
// p is the current input position, and t0 is the current thread.
|
||||
void AddToThreadq(Threadq* q, int id0, int c, int flag,
|
||||
const char* p, Thread* t0);
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates matched_ and match_ as new, better matches are found.
|
||||
// p is position of the next byte (the one after c)
|
||||
// in the input string, used when processing capturing parens.
|
||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input point (after c).
|
||||
// p is the position of byte c in the input string for AddToThreadq;
|
||||
// p-1 will be used when processing Match instructions.
|
||||
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input position (after c).
|
||||
// Frees all the threads on runq.
|
||||
// If there is a shortcut to the end, returns that shortcut.
|
||||
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
|
||||
|
||||
// Returns text version of capture information, for debugging.
|
||||
@ -102,10 +115,6 @@ class NFA {
|
||||
|
||||
inline void CopyCapture(const char** dst, const char** src);
|
||||
|
||||
// Computes whether all matches must begin with the same first
|
||||
// byte, and if so, returns that byte. If not, returns -1.
|
||||
int ComputeFirstByte();
|
||||
|
||||
Prog* prog_; // underlying program
|
||||
int start_; // start instruction in program
|
||||
int ncapture_; // number of submatches to track
|
||||
@ -118,16 +127,16 @@ class NFA {
|
||||
bool matched_; // any match so far?
|
||||
AddState* astack_; // pre-allocated for AddToThreadq
|
||||
int nastack_;
|
||||
int first_byte_; // required first byte for match, or -1 if none
|
||||
|
||||
Thread* free_threads_; // free list
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(NFA);
|
||||
NFA(const NFA&) = delete;
|
||||
NFA& operator=(const NFA&) = delete;
|
||||
};
|
||||
|
||||
NFA::NFA(Prog* prog) {
|
||||
prog_ = prog;
|
||||
start_ = prog->start();
|
||||
start_ = prog_->start();
|
||||
ncapture_ = 0;
|
||||
longest_ = false;
|
||||
endmatch_ = false;
|
||||
@ -135,12 +144,14 @@ NFA::NFA(Prog* prog) {
|
||||
etext_ = NULL;
|
||||
q0_.resize(prog_->size());
|
||||
q1_.resize(prog_->size());
|
||||
nastack_ = 2*prog_->size();
|
||||
// See NFA::AddToThreadq() for why this is so.
|
||||
nastack_ = 2*prog_->inst_count(kInstCapture) +
|
||||
prog_->inst_count(kInstEmptyWidth) +
|
||||
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
|
||||
astack_ = new AddState[nastack_];
|
||||
match_ = NULL;
|
||||
matched_ = false;
|
||||
free_threads_ = NULL;
|
||||
first_byte_ = ComputeFirstByte();
|
||||
}
|
||||
|
||||
NFA::~NFA() {
|
||||
@ -154,24 +165,36 @@ NFA::~NFA() {
|
||||
}
|
||||
}
|
||||
|
||||
void NFA::FreeThread(Thread *t) {
|
||||
if (t == NULL)
|
||||
return;
|
||||
t->next = free_threads_;
|
||||
free_threads_ = t;
|
||||
}
|
||||
|
||||
NFA::Thread* NFA::AllocThread() {
|
||||
Thread* t = free_threads_;
|
||||
if (t == NULL) {
|
||||
t = new Thread;
|
||||
t->ref = 1;
|
||||
t->capture = new const char*[ncapture_];
|
||||
return t;
|
||||
}
|
||||
free_threads_ = t->next;
|
||||
t->ref = 1;
|
||||
return t;
|
||||
}
|
||||
|
||||
NFA::Thread* NFA::Incref(Thread* t) {
|
||||
DCHECK(t != NULL);
|
||||
t->ref++;
|
||||
return t;
|
||||
}
|
||||
|
||||
void NFA::Decref(Thread* t) {
|
||||
if (t == NULL)
|
||||
return;
|
||||
t->ref--;
|
||||
if (t->ref > 0)
|
||||
return;
|
||||
DCHECK_EQ(t->ref, 0);
|
||||
t->next = free_threads_;
|
||||
free_threads_ = t;
|
||||
}
|
||||
|
||||
void NFA::CopyCapture(const char** dst, const char** src) {
|
||||
for (int i = 0; i < ncapture_; i+=2) {
|
||||
dst[i] = src[i];
|
||||
@ -180,35 +203,43 @@ void NFA::CopyCapture(const char** dst, const char** src) {
|
||||
}
|
||||
|
||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||
// Enqueues only the ByteRange instructions that match byte c.
|
||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
||||
// The pointer p is the current input position, and m is the
|
||||
// current set of match boundaries.
|
||||
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
|
||||
const char* p, const char** capture) {
|
||||
// p is the current input position, and t0 is the current thread.
|
||||
void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
|
||||
const char* p, Thread* t0) {
|
||||
if (id0 == 0)
|
||||
return;
|
||||
|
||||
// Astack_ is pre-allocated to avoid resize operations.
|
||||
// It has room for 2*prog_->size() entries, which is enough:
|
||||
// Each inst in prog can be processed at most once,
|
||||
// pushing at most two entries on stk.
|
||||
|
||||
int nstk = 0;
|
||||
// Use astack_ to hold our stack of instructions yet to process.
|
||||
// It was preallocated as follows:
|
||||
// two entries per Capture;
|
||||
// one entry per EmptyWidth; and
|
||||
// one entry per Nop.
|
||||
// This reflects the maximum number of stack pushes that each can
|
||||
// perform. (Each instruction can be processed at most once.)
|
||||
AddState* stk = astack_;
|
||||
stk[nstk++] = AddState(id0);
|
||||
int nstk = 0;
|
||||
|
||||
stk[nstk++] = AddState(id0);
|
||||
while (nstk > 0) {
|
||||
DCHECK_LE(nstk, nastack_);
|
||||
const AddState& a = stk[--nstk];
|
||||
if (a.j >= 0)
|
||||
capture[a.j] = a.cap_j;
|
||||
AddState a = stk[--nstk];
|
||||
|
||||
Loop:
|
||||
if (a.t != NULL) {
|
||||
// t0 was a thread that we allocated and copied in order to
|
||||
// record the capture, so we must now decref it.
|
||||
Decref(t0);
|
||||
t0 = a.t;
|
||||
}
|
||||
|
||||
int id = a.id;
|
||||
if (id == 0)
|
||||
continue;
|
||||
if (q->has_index(id)) {
|
||||
if (Debug)
|
||||
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
|
||||
if (ExtraDebug)
|
||||
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -231,62 +262,76 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
|
||||
|
||||
case kInstAltMatch:
|
||||
// Save state; will pick up at next byte.
|
||||
t = AllocThread();
|
||||
t->id = id;
|
||||
CopyCapture(t->capture, capture);
|
||||
t = Incref(t0);
|
||||
*tp = t;
|
||||
// fall through
|
||||
|
||||
case kInstAlt:
|
||||
// Explore alternatives.
|
||||
stk[nstk++] = AddState(ip->out1());
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
DCHECK(!ip->last());
|
||||
a = AddState(id+1);
|
||||
goto Loop;
|
||||
|
||||
case kInstNop:
|
||||
if (!ip->last())
|
||||
stk[nstk++] = AddState(id+1);
|
||||
|
||||
// Continue on.
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
a = AddState(ip->out());
|
||||
goto Loop;
|
||||
|
||||
case kInstCapture:
|
||||
if (!ip->last())
|
||||
stk[nstk++] = AddState(id+1);
|
||||
|
||||
if ((j=ip->cap()) < ncapture_) {
|
||||
// Push a dummy whose only job is to restore capture[j]
|
||||
// Push a dummy whose only job is to restore t0
|
||||
// once we finish exploring this possibility.
|
||||
stk[nstk++] = AddState(0, capture[j], j);
|
||||
stk[nstk++] = AddState(0, t0);
|
||||
|
||||
// Record capture.
|
||||
capture[j] = p;
|
||||
t = AllocThread();
|
||||
CopyCapture(t->capture, t0->capture);
|
||||
t->capture[j] = p;
|
||||
t0 = t;
|
||||
}
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
a = AddState(ip->out());
|
||||
goto Loop;
|
||||
|
||||
case kInstByteRange:
|
||||
if (!ip->Matches(c))
|
||||
goto Next;
|
||||
FALLTHROUGH_INTENDED;
|
||||
|
||||
case kInstMatch:
|
||||
case kInstByteRange:
|
||||
// Save state; will pick up at next byte.
|
||||
t = AllocThread();
|
||||
t->id = id;
|
||||
CopyCapture(t->capture, capture);
|
||||
t = Incref(t0);
|
||||
*tp = t;
|
||||
if (Debug)
|
||||
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
|
||||
break;
|
||||
if (ExtraDebug)
|
||||
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
|
||||
|
||||
Next:
|
||||
if (ip->last())
|
||||
break;
|
||||
a = AddState(id+1);
|
||||
goto Loop;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (!ip->last())
|
||||
stk[nstk++] = AddState(id+1);
|
||||
|
||||
// Continue on if we have all the right flag bits.
|
||||
if (ip->empty() & ~flag)
|
||||
break;
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
a = AddState(ip->out());
|
||||
goto Loop;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates match as new, better matches are found.
|
||||
// p is position of the byte c in the input string,
|
||||
// used when processing capturing parens.
|
||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input point (after c).
|
||||
// Updates matched_ and match_ as new, better matches are found.
|
||||
// p is the position of byte c in the input string for AddToThreadq;
|
||||
// p-1 will be used when processing Match instructions.
|
||||
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input position (after c).
|
||||
// Frees all the threads on runq.
|
||||
// If there is a shortcut to the end, returns that shortcut.
|
||||
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||
@ -300,12 +345,12 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||
if (longest_) {
|
||||
// Can skip any threads started after our current best match.
|
||||
if (matched_ && match_[0] < t->capture[0]) {
|
||||
FreeThread(t);
|
||||
Decref(t);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
int id = t->id;
|
||||
int id = i->index();
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
|
||||
switch (ip->opcode()) {
|
||||
@ -315,8 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
if (ip->Matches(c))
|
||||
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
|
||||
AddToThreadq(nextq, ip->out(), c, flag, p, t);
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
@ -324,52 +368,58 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||
break;
|
||||
// The match is ours if we want it.
|
||||
if (ip->greedy(prog_) || longest_) {
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
FreeThread(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
runq->clear();
|
||||
CopyCapture(match_, t->capture);
|
||||
matched_ = true;
|
||||
|
||||
Decref(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
Decref(i->second);
|
||||
runq->clear();
|
||||
if (ip->greedy(prog_))
|
||||
return ip->out1();
|
||||
return ip->out();
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (endmatch_ && p != etext_)
|
||||
case kInstMatch: {
|
||||
// Avoid invoking undefined behavior when p happens
|
||||
// to be null - and p-1 would be meaningless anyway.
|
||||
if (p == NULL)
|
||||
break;
|
||||
|
||||
if (endmatch_ && p-1 != etext_)
|
||||
break;
|
||||
|
||||
const char* old = t->capture[1]; // previous end pointer
|
||||
t->capture[1] = p;
|
||||
if (longest_) {
|
||||
// Leftmost-longest mode: save this match only if
|
||||
// it is either farther to the left or at the same
|
||||
// point but longer than an existing match.
|
||||
if (!matched_ || t->capture[0] < match_[0] ||
|
||||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
|
||||
CopyCapture(match_, t->capture);
|
||||
match_[1] = p-1;
|
||||
matched_ = true;
|
||||
}
|
||||
} else {
|
||||
// Leftmost-biased mode: this match is by definition
|
||||
// better than what we've already found (see next line).
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
CopyCapture(match_, t->capture);
|
||||
match_[1] = p-1;
|
||||
matched_ = true;
|
||||
|
||||
// Cut off the threads that can only find matches
|
||||
// worse than the one we just found: don't run the
|
||||
// rest of the current Threadq.
|
||||
t->capture[0] = old;
|
||||
FreeThread(t);
|
||||
Decref(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
Decref(i->second);
|
||||
runq->clear();
|
||||
matched_ = true;
|
||||
return 0;
|
||||
}
|
||||
t->capture[0] = old;
|
||||
matched_ = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
FreeThread(t);
|
||||
Decref(t);
|
||||
}
|
||||
runq->clear();
|
||||
return 0;
|
||||
@ -391,12 +441,6 @@ string NFA::FormatCapture(const char** capture) {
|
||||
return s;
|
||||
}
|
||||
|
||||
// Returns whether haystack contains needle's memory.
|
||||
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
|
||||
return haystack.begin() <= needle.begin() &&
|
||||
haystack.end() >= needle.end();
|
||||
}
|
||||
|
||||
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
@ -407,12 +451,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
if (context.begin() == NULL)
|
||||
context = text;
|
||||
|
||||
if (!StringPieceContains(context, text)) {
|
||||
LOG(FATAL) << "Bad args: context does not contain text "
|
||||
<< reinterpret_cast<const void*>(context.begin())
|
||||
<< "+" << context.size() << " "
|
||||
<< reinterpret_cast<const void*>(text.begin())
|
||||
<< "+" << text.size();
|
||||
// Sanity check: make sure that text lies within context.
|
||||
if (text.begin() < context.begin() || text.end() > context.end()) {
|
||||
LOG(DFATAL) << "context does not contain text";
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -445,16 +486,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
|
||||
match_ = new const char*[ncapture_];
|
||||
matched_ = false;
|
||||
memset(match_, 0, ncapture_*sizeof match_[0]);
|
||||
|
||||
// For debugging prints.
|
||||
btext_ = context.begin();
|
||||
|
||||
if (Debug) {
|
||||
if (ExtraDebug)
|
||||
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
||||
text.as_string().c_str(), context.as_string().c_str(), anchored,
|
||||
text.ToString().c_str(), context.ToString().c_str(), anchored,
|
||||
longest);
|
||||
}
|
||||
|
||||
// Set up search.
|
||||
Threadq* runq = &q0_;
|
||||
@ -462,14 +501,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
runq->clear();
|
||||
nextq->clear();
|
||||
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
|
||||
const char* bp = context.begin();
|
||||
int c = -1;
|
||||
int wasword = 0;
|
||||
|
||||
if (text.begin() > context.begin()) {
|
||||
c = text.begin()[-1] & 0xFF;
|
||||
wasword = Prog::IsWordChar(c);
|
||||
}
|
||||
if (text.begin() > context.begin())
|
||||
wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
|
||||
|
||||
// Loop over the text, stepping the machine.
|
||||
for (const char* p = text.begin();; p++) {
|
||||
@ -498,24 +533,29 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
else
|
||||
flag |= kEmptyNonWordBoundary;
|
||||
|
||||
if (Debug) {
|
||||
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
|
||||
if (ExtraDebug) {
|
||||
int c = 0;
|
||||
if (p == context.begin())
|
||||
c = '^';
|
||||
else if (p > text.end())
|
||||
c = '$';
|
||||
else if (p < text.end())
|
||||
c = p[0] & 0xFF;
|
||||
|
||||
fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->second;
|
||||
if (t == NULL)
|
||||
continue;
|
||||
fprintf(stderr, " %d%s", t->id,
|
||||
FormatCapture((const char**)t->capture).c_str());
|
||||
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Process previous character (waited until now to avoid
|
||||
// repeating the flag computation above).
|
||||
// This is a no-op the first time around the loop, because
|
||||
// runq is empty.
|
||||
int id = Step(runq, nextq, c, flag, p-1);
|
||||
// This is a no-op the first time around the loop because runq is empty.
|
||||
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
|
||||
DCHECK_EQ(runq->size(), 0);
|
||||
using std::swap;
|
||||
swap(nextq, runq);
|
||||
nextq->clear();
|
||||
if (id != 0) {
|
||||
@ -529,7 +569,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
match_[ip->cap()] = p;
|
||||
if (ip->cap() < ncapture_)
|
||||
match_[ip->cap()] = p;
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
@ -541,14 +582,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
match_[1] = p;
|
||||
matched_ = true;
|
||||
break;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
|
||||
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
|
||||
break;
|
||||
}
|
||||
id = ip->out();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -566,10 +599,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
// If there's a required first byte for an unanchored search
|
||||
// and we're not in the middle of any possible matches,
|
||||
// use memchr to search for the byte quickly.
|
||||
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
|
||||
p < text.end() && (p[0] & 0xFF) != first_byte_) {
|
||||
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
|
||||
text.end() - p));
|
||||
int fb = prog_->first_byte();
|
||||
if (!anchored && runq->size() == 0 &&
|
||||
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
|
||||
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
|
||||
if (p == NULL) {
|
||||
p = text.end();
|
||||
isword = 0;
|
||||
@ -579,59 +612,48 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
flag = Prog::EmptyFlags(context, p);
|
||||
}
|
||||
|
||||
// Steal match storage (cleared but unused as of yet)
|
||||
// temporarily to hold match boundaries for new thread.
|
||||
match_[0] = p;
|
||||
AddToThreadq(runq, start_, flag, p, match_);
|
||||
match_[0] = NULL;
|
||||
Thread* t = AllocThread();
|
||||
CopyCapture(t->capture, match_);
|
||||
t->capture[0] = p;
|
||||
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
|
||||
Decref(t);
|
||||
}
|
||||
|
||||
// If all the threads have died, stop early.
|
||||
if (runq->size() == 0) {
|
||||
if (Debug)
|
||||
if (ExtraDebug)
|
||||
fprintf(stderr, "dead\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (p == text.end())
|
||||
c = 0;
|
||||
else
|
||||
c = *p & 0xFF;
|
||||
wasword = isword;
|
||||
|
||||
// Will run step(runq, nextq, c, ...) on next iteration. See above.
|
||||
}
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
Decref(i->second);
|
||||
|
||||
if (matched_) {
|
||||
for (int i = 0; i < nsubmatch; i++)
|
||||
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
|
||||
if (Debug)
|
||||
fprintf(stderr, "match (%d,%d)\n",
|
||||
static_cast<int>(match_[0] - btext_),
|
||||
static_cast<int>(match_[1] - btext_));
|
||||
submatch[i] =
|
||||
StringPiece(match_[2 * i],
|
||||
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
|
||||
if (ExtraDebug)
|
||||
fprintf(stderr, "match (%td,%td)\n",
|
||||
match_[0] - btext_, match_[1] - btext_);
|
||||
return true;
|
||||
}
|
||||
VLOG(1) << "No matches found";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Computes whether all successful matches have a common first byte,
|
||||
// and if so, returns that byte. If not, returns -1.
|
||||
int NFA::ComputeFirstByte() {
|
||||
if (start_ == 0)
|
||||
return -1;
|
||||
|
||||
int b = -1; // first byte, not yet computed
|
||||
|
||||
typedef SparseSet Workq;
|
||||
Workq q(prog_->size());
|
||||
q.insert(start_);
|
||||
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
|
||||
int Prog::ComputeFirstByte() {
|
||||
int b = -1;
|
||||
SparseSet q(size());
|
||||
q.insert(start());
|
||||
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
|
||||
int id = *it;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
Prog::Inst* ip = inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
|
||||
@ -642,6 +664,9 @@ int NFA::ComputeFirstByte() {
|
||||
return -1;
|
||||
|
||||
case kInstByteRange:
|
||||
if (!ip->last())
|
||||
q.insert(id+1);
|
||||
|
||||
// Must match only a single byte
|
||||
if (ip->lo() != ip->hi())
|
||||
return -1;
|
||||
@ -658,6 +683,9 @@ int NFA::ComputeFirstByte() {
|
||||
case kInstNop:
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
if (!ip->last())
|
||||
q.insert(id+1);
|
||||
|
||||
// Continue on.
|
||||
// Ignore ip->empty() flags for kInstEmptyWidth
|
||||
// in order to be as conservative as possible
|
||||
@ -666,13 +694,9 @@ int NFA::ComputeFirstByte() {
|
||||
q.insert(ip->out());
|
||||
break;
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
// Explore alternatives.
|
||||
if (ip->out())
|
||||
q.insert(ip->out());
|
||||
if (ip->out1())
|
||||
q.insert(ip->out1());
|
||||
DCHECK(!ip->last());
|
||||
q.insert(id+1);
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
@ -686,7 +710,7 @@ bool
|
||||
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch) {
|
||||
if (NFA::Debug)
|
||||
if (ExtraDebug)
|
||||
Dump();
|
||||
|
||||
NFA nfa(this);
|
||||
@ -705,5 +729,63 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
// For each instruction i in the program reachable from the start, compute the
|
||||
// number of instructions reachable from i by following only empty transitions
|
||||
// and record that count as fanout[i].
|
||||
//
|
||||
// fanout holds the results and is also the work queue for the outer iteration.
|
||||
// reachable holds the reached nodes for the inner iteration.
|
||||
void Prog::Fanout(SparseArray<int>* fanout) {
|
||||
DCHECK_EQ(fanout->max_size(), size());
|
||||
SparseSet reachable(size());
|
||||
fanout->clear();
|
||||
fanout->set_new(start(), 0);
|
||||
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
|
||||
int* count = &i->second;
|
||||
reachable.clear();
|
||||
reachable.insert(i->index());
|
||||
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
|
||||
int id = *j;
|
||||
Prog::Inst* ip = inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
if (!ip->last())
|
||||
reachable.insert(id+1);
|
||||
|
||||
(*count)++;
|
||||
if (!fanout->has_index(ip->out())) {
|
||||
fanout->set_new(ip->out(), 0);
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
DCHECK(!ip->last());
|
||||
reachable.insert(id+1);
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
case kInstNop:
|
||||
if (!ip->last())
|
||||
reachable.insert(id+1);
|
||||
|
||||
reachable.insert(ip->out());
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (!ip->last())
|
||||
reachable.insert(id+1);
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
@ -50,17 +50,29 @@
|
||||
// See also Anne Brüggemann-Klein and Derick Wood,
|
||||
// "One-unambiguous regular languages", Information and Computation 142(2).
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/arena.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "util/strutil.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
// Silence "zero-sized array in struct/union" warning for OneState::action.
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable: 4200)
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const int Debug = 0;
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
// The key insight behind this implementation is that the
|
||||
// non-determinism in an NFA for a one-pass regular expression
|
||||
@ -126,19 +138,16 @@ static const int Debug = 0;
|
||||
// whether a set of conditions required to finish a match at that
|
||||
// point in the input rather than process the next byte.
|
||||
|
||||
// A state in the one-pass NFA (aka DFA) - just an array of actions.
|
||||
struct OneState;
|
||||
|
||||
// A state in the one-pass NFA - just an array of actions indexed
|
||||
// by the bytemap_[] of the next input byte. (The bytemap
|
||||
// maps next input bytes into equivalence classes, to reduce
|
||||
// the memory footprint.)
|
||||
struct OneState {
|
||||
uint32 matchcond; // conditions to match right now.
|
||||
uint32 action[1];
|
||||
uint32_t matchcond; // conditions to match right now.
|
||||
uint32_t action[];
|
||||
};
|
||||
|
||||
// The uint32 conditions in the action are a combination of
|
||||
// The uint32_t conditions in the action are a combination of
|
||||
// condition and capture bits and the next state. The bottom 16 bits
|
||||
// are the condition and capture bits, and the top 16 are the index of
|
||||
// the next state.
|
||||
@ -155,8 +164,8 @@ struct OneState {
|
||||
// and kEmptyNonWordBoundary, so we can use that as a sentinel
|
||||
// instead of needing an extra bit.
|
||||
|
||||
static const int kIndexShift = 16; // number of bits below index
|
||||
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
||||
static const int kIndexShift = 16; // number of bits below index
|
||||
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
||||
static const int kRealCapShift = kEmptyShift + 1;
|
||||
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
||||
|
||||
@ -164,23 +173,23 @@ static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
||||
static const int kCapShift = kRealCapShift - 2;
|
||||
static const int kMaxCap = kRealMaxCap + 2;
|
||||
|
||||
static const uint32 kMatchWins = 1 << kEmptyShift;
|
||||
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
||||
static const uint32_t kMatchWins = 1 << kEmptyShift;
|
||||
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
||||
|
||||
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
||||
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
||||
|
||||
// Check, at compile time, that prog.h agrees with math above.
|
||||
// This function is never called.
|
||||
void OnePass_Checks() {
|
||||
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
||||
kEmptyShift_disagrees_with_kEmptyAllFlags);
|
||||
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
||||
"kEmptyShift disagrees with kEmptyAllFlags");
|
||||
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
||||
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
|
||||
kMaxCap_disagrees_with_kMaxOnePassCapture);
|
||||
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
|
||||
"kMaxCap disagrees with kMaxOnePassCapture");
|
||||
}
|
||||
|
||||
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
|
||||
uint32 satisfied = Prog::EmptyFlags(context, p);
|
||||
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
|
||||
uint32_t satisfied = Prog::EmptyFlags(context, p);
|
||||
if (cond & kEmptyAllFlags & ~satisfied)
|
||||
return false;
|
||||
return true;
|
||||
@ -188,20 +197,17 @@ static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
|
||||
|
||||
// Apply the capture bits in cond, saving p to the appropriate
|
||||
// locations in cap[].
|
||||
static void ApplyCaptures(uint32 cond, const char* p,
|
||||
static void ApplyCaptures(uint32_t cond, const char* p,
|
||||
const char** cap, int ncap) {
|
||||
for (int i = 2; i < ncap; i++)
|
||||
if (cond & (1 << kCapShift << i))
|
||||
cap[i] = p;
|
||||
}
|
||||
|
||||
// Compute a node pointer.
|
||||
// Basically (OneState*)(nodes + statesize*nodeindex)
|
||||
// but the version with the C++ casts overflows 80 characters (and is ugly).
|
||||
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
|
||||
// Computes the OneState* for the given nodeindex.
|
||||
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
|
||||
int nodeindex) {
|
||||
return reinterpret_cast<OneState*>(
|
||||
const_cast<uint8*>(nodes + statesize*nodeindex));
|
||||
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
|
||||
}
|
||||
|
||||
bool Prog::SearchOnePass(const StringPiece& text,
|
||||
@ -237,30 +243,27 @@ bool Prog::SearchOnePass(const StringPiece& text,
|
||||
if (anchor_end())
|
||||
kind = kFullMatch;
|
||||
|
||||
// State and act are marked volatile to
|
||||
// keep the compiler from re-ordering the
|
||||
// memory accesses walking over the NFA.
|
||||
// This is worth about 5%.
|
||||
volatile OneState* state = onepass_start_;
|
||||
volatile uint8* nodes = onepass_nodes_;
|
||||
volatile uint32 statesize = onepass_statesize_;
|
||||
uint8* bytemap = bytemap_;
|
||||
uint8_t* nodes = onepass_nodes_;
|
||||
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
||||
// start() is always mapped to the zeroth OneState.
|
||||
OneState* state = IndexToNode(nodes, statesize, 0);
|
||||
uint8_t* bytemap = bytemap_;
|
||||
const char* bp = text.begin();
|
||||
const char* ep = text.end();
|
||||
const char* p;
|
||||
bool matched = false;
|
||||
matchcap[0] = bp;
|
||||
cap[0] = bp;
|
||||
uint32 nextmatchcond = state->matchcond;
|
||||
uint32_t nextmatchcond = state->matchcond;
|
||||
for (p = bp; p < ep; p++) {
|
||||
int c = bytemap[*p & 0xFF];
|
||||
uint32 matchcond = nextmatchcond;
|
||||
uint32 cond = state->action[c];
|
||||
uint32_t matchcond = nextmatchcond;
|
||||
uint32_t cond = state->action[c];
|
||||
|
||||
// Determine whether we can reach act->next.
|
||||
// If so, advance state and nextmatchcond.
|
||||
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
||||
uint32 nextindex = cond >> kIndexShift;
|
||||
uint32_t nextindex = cond >> kIndexShift;
|
||||
state = IndexToNode(nodes, statesize, nextindex);
|
||||
nextmatchcond = state->matchcond;
|
||||
} else {
|
||||
@ -319,7 +322,7 @@ bool Prog::SearchOnePass(const StringPiece& text,
|
||||
|
||||
// Look for match at end of input.
|
||||
{
|
||||
uint32 matchcond = state->matchcond;
|
||||
uint32_t matchcond = state->matchcond;
|
||||
if (matchcond != kImpossible &&
|
||||
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
||||
if (nmatch > 1 && (matchcond & kCapMask))
|
||||
@ -335,7 +338,9 @@ done:
|
||||
if (!matched)
|
||||
return false;
|
||||
for (int i = 0; i < nmatch; i++)
|
||||
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
|
||||
match[i] =
|
||||
StringPiece(matchcap[2 * i],
|
||||
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -357,7 +362,7 @@ static bool AddQ(Instq *q, int id) {
|
||||
|
||||
struct InstCond {
|
||||
int id;
|
||||
uint32 cond;
|
||||
uint32_t cond;
|
||||
};
|
||||
|
||||
// Returns whether this is a one-pass program; that is,
|
||||
@ -377,7 +382,7 @@ struct InstCond {
|
||||
// Constructs and saves corresponding one-pass NFA on success.
|
||||
bool Prog::IsOnePass() {
|
||||
if (did_onepass_)
|
||||
return onepass_start_ != NULL;
|
||||
return onepass_nodes_ != NULL;
|
||||
did_onepass_ = true;
|
||||
|
||||
if (start() == 0) // no match
|
||||
@ -387,32 +392,37 @@ bool Prog::IsOnePass() {
|
||||
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
||||
// Limit max node count to 65000 as a conservative estimate to
|
||||
// avoid overflowing 16-bit node index in encoding.
|
||||
int maxnodes = 2 + byte_inst_count_;
|
||||
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
|
||||
int maxnodes = 2 + inst_count(kInstByteRange);
|
||||
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
||||
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
||||
return false;
|
||||
|
||||
// Flood the graph starting at the start state, and check
|
||||
// that in each reachable state, each possible byte leads
|
||||
// to a unique next state.
|
||||
int size = this->size();
|
||||
InstCond *stack = new InstCond[size];
|
||||
int stacksize = inst_count(kInstCapture) +
|
||||
inst_count(kInstEmptyWidth) +
|
||||
inst_count(kInstNop) + 1; // + 1 for start inst
|
||||
InstCond* stack = new InstCond[stacksize];
|
||||
|
||||
int size = this->size();
|
||||
int* nodebyid = new int[size]; // indexed by ip
|
||||
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
||||
|
||||
uint8* nodes = new uint8[maxnodes*statesize];
|
||||
uint8* nodep = nodes;
|
||||
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
|
||||
// unnecessarily optimistic: why allocate a large amount of memory
|
||||
// upfront for a large program when it is unlikely to be one-pass?
|
||||
std::vector<uint8_t> nodes;
|
||||
|
||||
Instq tovisit(size), workq(size);
|
||||
AddQ(&tovisit, start());
|
||||
nodebyid[start()] = 0;
|
||||
nodep += statesize;
|
||||
int nalloc = 1;
|
||||
nodes.insert(nodes.end(), statesize, 0);
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
||||
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||
|
||||
// Flood graph using manual stack, filling in actions as found.
|
||||
// Default is none.
|
||||
@ -427,93 +437,108 @@ bool Prog::IsOnePass() {
|
||||
stack[nstack++].cond = 0;
|
||||
while (nstack > 0) {
|
||||
int id = stack[--nstack].id;
|
||||
uint32_t cond = stack[nstack].cond;
|
||||
|
||||
Loop:
|
||||
Prog::Inst* ip = inst(id);
|
||||
uint32 cond = stack[nstack].cond;
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
||||
// Should implement it in this engine, but it's subtle.
|
||||
// Fall through.
|
||||
case kInstAlt:
|
||||
DCHECK(!ip->last());
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
stack[nstack].id = ip->out1();
|
||||
stack[nstack++].cond = cond;
|
||||
stack[nstack].id = ip->out();
|
||||
stack[nstack++].cond = cond;
|
||||
break;
|
||||
id = id+1;
|
||||
goto Loop;
|
||||
|
||||
case kInstByteRange: {
|
||||
int nextindex = nodebyid[ip->out()];
|
||||
if (nextindex == -1) {
|
||||
if (nalloc >= maxnodes) {
|
||||
if (Debug)
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: hit node limit %d > %d",
|
||||
nalloc, maxnodes);
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << StringPrintf(
|
||||
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
|
||||
goto fail;
|
||||
}
|
||||
nextindex = nalloc;
|
||||
nodep += statesize;
|
||||
nodebyid[ip->out()] = nextindex;
|
||||
nalloc++;
|
||||
AddQ(&tovisit, ip->out());
|
||||
nodebyid[ip->out()] = nalloc;
|
||||
nalloc++;
|
||||
nodes.insert(nodes.end(), statesize, 0);
|
||||
// Update node because it might have been invalidated.
|
||||
node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||
}
|
||||
if (matched)
|
||||
cond |= kMatchWins;
|
||||
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
||||
int b = bytemap_[c];
|
||||
c = unbytemap_[b]; // last c in byte class
|
||||
uint32 act = node->action[b];
|
||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
||||
// Skip any bytes immediately after c that are also in b.
|
||||
while (c < 256-1 && bytemap_[c+1] == b)
|
||||
c++;
|
||||
uint32_t act = node->action[b];
|
||||
uint32_t newact = (nextindex << kIndexShift) | cond;
|
||||
if (matched)
|
||||
newact |= kMatchWins;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (Debug) {
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: conflict on byte "
|
||||
"%#x at state %d",
|
||||
c, *it);
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << StringPrintf(
|
||||
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
if (ip->foldcase()) {
|
||||
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
||||
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
||||
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
||||
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
||||
for (int c = lo; c <= hi; c++) {
|
||||
int b = bytemap_[c];
|
||||
c = unbytemap_[b]; // last c in class
|
||||
uint32 act = node->action[b];
|
||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
||||
// Skip any bytes immediately after c that are also in b.
|
||||
while (c < 256-1 && bytemap_[c+1] == b)
|
||||
c++;
|
||||
uint32_t act = node->action[b];
|
||||
uint32_t newact = (nextindex << kIndexShift) | cond;
|
||||
if (matched)
|
||||
newact |= kMatchWins;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (Debug) {
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: conflict on byte "
|
||||
"%#x at state %d",
|
||||
c, *it);
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << StringPrintf(
|
||||
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
if (ip->last())
|
||||
break;
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
id = id+1;
|
||||
goto Loop;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
if (ip->cap() < kMaxCap)
|
||||
cond |= (1 << kCapShift) << ip->cap();
|
||||
goto QueueEmpty;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
cond |= ip->empty();
|
||||
goto QueueEmpty;
|
||||
|
||||
case kInstNop:
|
||||
QueueEmpty:
|
||||
if (!ip->last()) {
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
stack[nstack].id = id+1;
|
||||
stack[nstack++].cond = cond;
|
||||
}
|
||||
|
||||
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
|
||||
cond |= (1 << kCapShift) << ip->cap();
|
||||
if (ip->opcode() == kInstEmptyWidth)
|
||||
cond |= ip->empty();
|
||||
|
||||
// kInstCapture and kInstNop always proceed to ip->out().
|
||||
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
||||
// but as a conservative approximation we assume it always does.
|
||||
@ -522,29 +547,32 @@ bool Prog::IsOnePass() {
|
||||
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out())) {
|
||||
if (Debug) {
|
||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
|
||||
" %d -> %d\n",
|
||||
*it, ip->out());
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << StringPrintf(
|
||||
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
|
||||
goto fail;
|
||||
}
|
||||
stack[nstack].id = ip->out();
|
||||
stack[nstack++].cond = cond;
|
||||
break;
|
||||
id = ip->out();
|
||||
goto Loop;
|
||||
|
||||
case kInstMatch:
|
||||
if (matched) {
|
||||
// (3) is violated
|
||||
if (Debug) {
|
||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
|
||||
" from %d\n", *it);
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << StringPrintf(
|
||||
"Not OnePass: multiple matches from %d\n", *it);
|
||||
goto fail;
|
||||
}
|
||||
matched = true;
|
||||
node->matchcond = cond;
|
||||
break;
|
||||
|
||||
if (ip->last())
|
||||
break;
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, id+1))
|
||||
goto fail;
|
||||
id = id+1;
|
||||
goto Loop;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
@ -552,29 +580,22 @@ bool Prog::IsOnePass() {
|
||||
}
|
||||
}
|
||||
|
||||
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
||||
string dump = "prog dump:\n" + Dump() + "node dump\n";
|
||||
map<int, int> idmap;
|
||||
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
||||
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
|
||||
LOG(ERROR) << "prog:\n" << Dump();
|
||||
|
||||
std::map<int, int> idmap;
|
||||
for (int i = 0; i < size; i++)
|
||||
if (nodebyid[i] != -1)
|
||||
idmap[nodebyid[i]] = i;
|
||||
|
||||
StringAppendF(&dump, "byte ranges:\n");
|
||||
int i = 0;
|
||||
for (int b = 0; b < bytemap_range_; b++) {
|
||||
int lo = i;
|
||||
while (bytemap_[i] == b)
|
||||
i++;
|
||||
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
|
||||
}
|
||||
|
||||
string dump;
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
if (nodeindex == -1)
|
||||
continue;
|
||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
||||
string s;
|
||||
continue;
|
||||
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
||||
nodeindex, id, node->matchcond);
|
||||
for (int i = 0; i < bytemap_range_; i++) {
|
||||
@ -586,19 +607,12 @@ bool Prog::IsOnePass() {
|
||||
idmap[node->action[i] >> kIndexShift]);
|
||||
}
|
||||
}
|
||||
LOG(ERROR) << dump;
|
||||
LOG(ERROR) << "nodes:\n" << dump;
|
||||
}
|
||||
|
||||
// Overallocated earlier; cut down to actual size.
|
||||
nodep = new uint8[nalloc*statesize];
|
||||
memmove(nodep, nodes, nalloc*statesize);
|
||||
delete[] nodes;
|
||||
nodes = nodep;
|
||||
|
||||
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
|
||||
onepass_nodes_ = nodes;
|
||||
onepass_statesize_ = statesize;
|
||||
dfa_mem_ -= nalloc*statesize;
|
||||
onepass_nodes_ = new uint8_t[nalloc*statesize];
|
||||
memmove(onepass_nodes_, nodes.data(), nalloc*statesize);
|
||||
|
||||
delete[] stack;
|
||||
delete[] nodebyid;
|
||||
@ -607,7 +621,6 @@ bool Prog::IsOnePass() {
|
||||
fail:
|
||||
delete[] stack;
|
||||
delete[] nodebyid;
|
||||
delete[] nodes;
|
||||
return false;
|
||||
}
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,34 +2,38 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/prefilter.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/strutil.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/unicode_casefold.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const int Trace = false;
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
typedef set<string>::iterator SSIter;
|
||||
typedef set<string>::const_iterator ConstSSIter;
|
||||
typedef std::set<string>::iterator SSIter;
|
||||
typedef std::set<string>::const_iterator ConstSSIter;
|
||||
|
||||
static int alloc_id = 100000; // Used for debugging.
|
||||
// Initializes a Prefilter, allocating subs_ as necessary.
|
||||
Prefilter::Prefilter(Op op) {
|
||||
op_ = op;
|
||||
subs_ = NULL;
|
||||
if (op_ == AND || op_ == OR)
|
||||
subs_ = new vector<Prefilter*>;
|
||||
|
||||
alloc_id_ = alloc_id++;
|
||||
VLOG(10) << "alloc_id: " << alloc_id_;
|
||||
subs_ = new std::vector<Prefilter*>;
|
||||
}
|
||||
|
||||
// Destroys a Prefilter.
|
||||
Prefilter::~Prefilter() {
|
||||
VLOG(10) << "Deleted: " << alloc_id_;
|
||||
if (subs_) {
|
||||
for (size_t i = 0; i < subs_->size(); i++)
|
||||
delete (*subs_)[i];
|
||||
@ -45,7 +49,7 @@ Prefilter* Prefilter::Simplify() {
|
||||
}
|
||||
|
||||
// Nothing left in the AND/OR.
|
||||
if (subs_->size() == 0) {
|
||||
if (subs_->empty()) {
|
||||
if (op_ == AND)
|
||||
op_ = ALL; // AND of nothing is true
|
||||
else
|
||||
@ -136,7 +140,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
|
||||
return AndOr(OR, a, b);
|
||||
}
|
||||
|
||||
static void SimplifyStringSet(set<string> *ss) {
|
||||
static void SimplifyStringSet(std::set<string> *ss) {
|
||||
// Now make sure that the strings aren't redundant. For example, if
|
||||
// we know "ab" is a required string, then it doesn't help at all to
|
||||
// know that "abc" is also a required string, so delete "abc". This
|
||||
@ -157,7 +161,7 @@ static void SimplifyStringSet(set<string> *ss) {
|
||||
}
|
||||
}
|
||||
|
||||
Prefilter* Prefilter::OrStrings(set<string>* ss) {
|
||||
Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
|
||||
SimplifyStringSet(ss);
|
||||
Prefilter* or_prefilter = NULL;
|
||||
if (!ss->empty()) {
|
||||
@ -176,7 +180,7 @@ static Rune ToLowerRune(Rune r) {
|
||||
}
|
||||
|
||||
const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
|
||||
if (f == NULL || r < static_cast<Rune>(f->lo))
|
||||
if (f == NULL || r < f->lo)
|
||||
return r;
|
||||
return ApplyFold(f, r);
|
||||
}
|
||||
@ -222,14 +226,14 @@ class Prefilter::Info {
|
||||
// Caller takes ownership of the Prefilter.
|
||||
Prefilter* TakeMatch();
|
||||
|
||||
set<string>& exact() { return exact_; }
|
||||
std::set<string>& exact() { return exact_; }
|
||||
|
||||
bool is_exact() const { return is_exact_; }
|
||||
|
||||
class Walker;
|
||||
|
||||
private:
|
||||
set<string> exact_;
|
||||
std::set<string> exact_;
|
||||
|
||||
// When is_exact_ is true, the strings that match
|
||||
// are placed in exact_. When it is no longer an exact
|
||||
@ -268,7 +272,9 @@ string Prefilter::Info::ToString() {
|
||||
if (is_exact_) {
|
||||
int n = 0;
|
||||
string s;
|
||||
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
|
||||
for (std::set<string>::iterator i = exact_.begin();
|
||||
i != exact_.end();
|
||||
++i) {
|
||||
if (n++ > 0)
|
||||
s += ",";
|
||||
s += *i;
|
||||
@ -283,16 +289,17 @@ string Prefilter::Info::ToString() {
|
||||
}
|
||||
|
||||
// Add the strings from src to dst.
|
||||
static void CopyIn(const set<string>& src, set<string>* dst) {
|
||||
static void CopyIn(const std::set<string>& src,
|
||||
std::set<string>* dst) {
|
||||
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
|
||||
dst->insert(*i);
|
||||
}
|
||||
|
||||
// Add the cross-product of a and b to dst.
|
||||
// (For each string i in a and j in b, add i+j.)
|
||||
static void CrossProduct(const set<string>& a,
|
||||
const set<string>& b,
|
||||
set<string>* dst) {
|
||||
static void CrossProduct(const std::set<string>& a,
|
||||
const std::set<string>& b,
|
||||
std::set<string>* dst) {
|
||||
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
|
||||
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
|
||||
dst->insert(*i + *j);
|
||||
@ -446,10 +453,10 @@ Prefilter::Info* Prefilter::Info::EmptyString() {
|
||||
typedef CharClass::iterator CCIter;
|
||||
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
||||
bool latin1) {
|
||||
if (Trace) {
|
||||
VLOG(0) << "CharClassInfo:";
|
||||
if (ExtraDebug) {
|
||||
LOG(ERROR) << "CharClassInfo:";
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
VLOG(0) << " " << i->lo << "-" << i->hi;
|
||||
LOG(ERROR) << " " << i->lo << "-" << i->hi;
|
||||
}
|
||||
|
||||
// If the class is too large, it's okay to overestimate.
|
||||
@ -469,9 +476,8 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
||||
|
||||
a->is_exact_ = true;
|
||||
|
||||
if (Trace) {
|
||||
VLOG(0) << " = " << a->ToString();
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << " = " << a->ToString();
|
||||
|
||||
return a;
|
||||
}
|
||||
@ -492,15 +498,16 @@ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
|
||||
bool latin1() { return latin1_; }
|
||||
private:
|
||||
bool latin1_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Walker);
|
||||
|
||||
Walker(const Walker&) = delete;
|
||||
Walker& operator=(const Walker&) = delete;
|
||||
};
|
||||
|
||||
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
|
||||
if (Trace) {
|
||||
LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
|
||||
|
||||
bool latin1 = re->parse_flags() & Regexp::Latin1;
|
||||
bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
|
||||
Prefilter::Info::Walker w(latin1);
|
||||
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
|
||||
|
||||
@ -600,7 +607,6 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
||||
info = child_args[0];
|
||||
for (int i = 1; i < nchild_args; i++)
|
||||
info = Alt(info, child_args[i]);
|
||||
VLOG(10) << "Alt: " << info->ToString();
|
||||
break;
|
||||
|
||||
case kRegexpStar:
|
||||
@ -630,10 +636,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
||||
break;
|
||||
}
|
||||
|
||||
if (Trace) {
|
||||
VLOG(0) << "BuildInfo " << re->ToString()
|
||||
<< ": " << (info ? info->ToString() : "");
|
||||
}
|
||||
if (ExtraDebug)
|
||||
LOG(ERROR) << "BuildInfo " << re->ToString()
|
||||
<< ": " << (info ? info->ToString() : "");
|
||||
|
||||
return info;
|
||||
}
|
||||
|
@ -2,14 +2,19 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_PREFILTER_H_
|
||||
#define RE2_PREFILTER_H_
|
||||
|
||||
// Prefilter is the class used to extract string guards from regexps.
|
||||
// Rather than using Prefilter class directly, use FilteredRE2.
|
||||
// See filtered_re2.h
|
||||
|
||||
#ifndef RE2_PREFILTER_H_
|
||||
#define RE2_PREFILTER_H_
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
@ -37,14 +42,14 @@ class Prefilter {
|
||||
int unique_id() const { return unique_id_; }
|
||||
|
||||
// The children of the Prefilter node.
|
||||
vector<Prefilter*>* subs() {
|
||||
CHECK(op_ == AND || op_ == OR);
|
||||
std::vector<Prefilter*>* subs() {
|
||||
DCHECK(op_ == AND || op_ == OR);
|
||||
return subs_;
|
||||
}
|
||||
|
||||
// Set the children vector. Prefilter takes ownership of subs and
|
||||
// subs_ will be deleted when Prefilter is deleted.
|
||||
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
|
||||
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
|
||||
|
||||
// Given a RE2, return a Prefilter. The caller takes ownership of
|
||||
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
||||
@ -72,7 +77,7 @@ class Prefilter {
|
||||
|
||||
static Prefilter* FromString(const string& str);
|
||||
|
||||
static Prefilter* OrStrings(set<string>* ss);
|
||||
static Prefilter* OrStrings(std::set<string>* ss);
|
||||
|
||||
static Info* BuildInfo(Regexp* re);
|
||||
|
||||
@ -82,7 +87,7 @@ class Prefilter {
|
||||
Op op_;
|
||||
|
||||
// Sub-matches for AND or OR Prefilter.
|
||||
vector<Prefilter*>* subs_;
|
||||
std::vector<Prefilter*>* subs_;
|
||||
|
||||
// Actual string to match in leaf node.
|
||||
string atom_;
|
||||
@ -94,10 +99,8 @@ class Prefilter {
|
||||
// and -1 for duplicate nodes.
|
||||
int unique_id_;
|
||||
|
||||
// Used for debugging, helps in tracking memory leaks.
|
||||
int alloc_id_;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
|
||||
Prefilter(const Prefilter&) = delete;
|
||||
Prefilter& operator=(const Prefilter&) = delete;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
@ -2,20 +2,35 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
DEFINE_int32(filtered_re2_min_atom_len,
|
||||
3,
|
||||
"Strings less than this length are not stored as atoms");
|
||||
#include <stddef.h>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/strutil.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const bool ExtraDebug = false;
|
||||
|
||||
PrefilterTree::PrefilterTree()
|
||||
: compiled_(false) {
|
||||
: compiled_(false),
|
||||
min_atom_len_(3) {
|
||||
}
|
||||
|
||||
PrefilterTree::PrefilterTree(int min_atom_len)
|
||||
: compiled_(false),
|
||||
min_atom_len_(min_atom_len) {
|
||||
}
|
||||
|
||||
PrefilterTree::~PrefilterTree() {
|
||||
@ -26,62 +41,22 @@ PrefilterTree::~PrefilterTree() {
|
||||
delete entries_[i].parents;
|
||||
}
|
||||
|
||||
// Functions used for adding and Compiling prefilters to the
|
||||
// PrefilterTree.
|
||||
static bool KeepPart(Prefilter* prefilter, int level) {
|
||||
if (prefilter == NULL)
|
||||
return false;
|
||||
|
||||
switch (prefilter->op()) {
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected op in KeepPart: "
|
||||
<< prefilter->op();
|
||||
return false;
|
||||
|
||||
case Prefilter::ALL:
|
||||
return false;
|
||||
|
||||
case Prefilter::ATOM:
|
||||
return prefilter->atom().size() >=
|
||||
static_cast<size_t>(FLAGS_filtered_re2_min_atom_len);
|
||||
|
||||
case Prefilter::AND: {
|
||||
int j = 0;
|
||||
vector<Prefilter*>* subs = prefilter->subs();
|
||||
for (size_t i = 0; i < subs->size(); i++)
|
||||
if (KeepPart((*subs)[i], level + 1))
|
||||
(*subs)[j++] = (*subs)[i];
|
||||
else
|
||||
delete (*subs)[i];
|
||||
|
||||
subs->resize(j);
|
||||
return j > 0;
|
||||
}
|
||||
|
||||
case Prefilter::OR:
|
||||
for (size_t i = 0; i < prefilter->subs()->size(); i++)
|
||||
if (!KeepPart((*prefilter->subs())[i], level + 1))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void PrefilterTree::Add(Prefilter *f) {
|
||||
void PrefilterTree::Add(Prefilter* prefilter) {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "Add after Compile.";
|
||||
LOG(DFATAL) << "Add called after Compile.";
|
||||
return;
|
||||
}
|
||||
if (f != NULL && !KeepPart(f, 0)) {
|
||||
delete f;
|
||||
f = NULL;
|
||||
if (prefilter != NULL && !KeepNode(prefilter)) {
|
||||
delete prefilter;
|
||||
prefilter = NULL;
|
||||
}
|
||||
|
||||
prefilter_vec_.push_back(f);
|
||||
prefilter_vec_.push_back(prefilter);
|
||||
}
|
||||
|
||||
void PrefilterTree::Compile(vector<string>* atom_vec) {
|
||||
void PrefilterTree::Compile(std::vector<string>* atom_vec) {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "Compile after Compile.";
|
||||
LOG(DFATAL) << "Compile called already.";
|
||||
return;
|
||||
}
|
||||
|
||||
@ -93,7 +68,9 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
|
||||
|
||||
compiled_ = true;
|
||||
|
||||
AssignUniqueIds(atom_vec);
|
||||
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
|
||||
NodeMap nodes;
|
||||
AssignUniqueIds(&nodes, atom_vec);
|
||||
|
||||
// Identify nodes that are too common among prefilters and are
|
||||
// triggering too many parents. Then get rid of them if possible.
|
||||
@ -109,9 +86,11 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
|
||||
// this trigger. TODO(vsri): Adjust the threshold appropriately,
|
||||
// make it a function of total number of nodes?
|
||||
bool have_other_guard = true;
|
||||
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
||||
for (StdIntMap::iterator it = parents->begin();
|
||||
it != parents->end(); ++it) {
|
||||
have_other_guard = have_other_guard &&
|
||||
(entries_[it->first].propagate_up_at_count > 1);
|
||||
}
|
||||
|
||||
if (have_other_guard) {
|
||||
for (StdIntMap::iterator it = parents->begin();
|
||||
@ -123,50 +102,82 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
|
||||
}
|
||||
}
|
||||
|
||||
PrintDebugInfo();
|
||||
if (ExtraDebug)
|
||||
PrintDebugInfo(&nodes);
|
||||
}
|
||||
|
||||
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
|
||||
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
|
||||
string node_string = NodeString(node);
|
||||
map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
|
||||
if (iter == node_map_.end())
|
||||
std::map<string, Prefilter*>::iterator iter = nodes->find(node_string);
|
||||
if (iter == nodes->end())
|
||||
return NULL;
|
||||
return (*iter).second;
|
||||
}
|
||||
|
||||
static string Itoa(int n) {
|
||||
char buf[100];
|
||||
snprintf(buf, sizeof buf, "%d", n);
|
||||
return string(buf);
|
||||
}
|
||||
|
||||
string PrefilterTree::NodeString(Prefilter* node) const {
|
||||
// Adding the operation disambiguates AND/OR/atom nodes.
|
||||
string s = Itoa(node->op()) + ":";
|
||||
string s = StringPrintf("%d", node->op()) + ":";
|
||||
if (node->op() == Prefilter::ATOM) {
|
||||
s += node->atom();
|
||||
} else {
|
||||
for (size_t i = 0; i < node->subs()->size() ; i++) {
|
||||
for (size_t i = 0; i < node->subs()->size(); i++) {
|
||||
if (i > 0)
|
||||
s += ',';
|
||||
s += Itoa((*node->subs())[i]->unique_id());
|
||||
s += StringPrintf("%d", (*node->subs())[i]->unique_id());
|
||||
}
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
bool PrefilterTree::KeepNode(Prefilter* node) const {
|
||||
if (node == NULL)
|
||||
return false;
|
||||
|
||||
switch (node->op()) {
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
|
||||
return false;
|
||||
|
||||
case Prefilter::ALL:
|
||||
return false;
|
||||
|
||||
case Prefilter::ATOM:
|
||||
return node->atom().size() >= static_cast<size_t>(min_atom_len_);
|
||||
|
||||
case Prefilter::AND: {
|
||||
int j = 0;
|
||||
std::vector<Prefilter*>* subs = node->subs();
|
||||
for (size_t i = 0; i < subs->size(); i++)
|
||||
if (KeepNode((*subs)[i]))
|
||||
(*subs)[j++] = (*subs)[i];
|
||||
else
|
||||
delete (*subs)[i];
|
||||
|
||||
subs->resize(j);
|
||||
return j > 0;
|
||||
}
|
||||
|
||||
case Prefilter::OR:
|
||||
for (size_t i = 0; i < node->subs()->size(); i++)
|
||||
if (!KeepNode((*node->subs())[i]))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
|
||||
std::vector<string>* atom_vec) {
|
||||
atom_vec->clear();
|
||||
|
||||
// Build vector of all filter nodes, sorted topologically
|
||||
// from top to bottom in v.
|
||||
vector<Prefilter*> v;
|
||||
std::vector<Prefilter*> v;
|
||||
|
||||
// Add the top level nodes of each regexp prefilter.
|
||||
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
|
||||
Prefilter* f = prefilter_vec_[i];
|
||||
if (f == NULL)
|
||||
unfiltered_.push_back(i);
|
||||
unfiltered_.push_back(static_cast<int>(i));
|
||||
|
||||
// We push NULL also on to v, so that we maintain the
|
||||
// mapping of index==regexpid for level=0 prefilter nodes.
|
||||
@ -179,7 +190,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
if (f == NULL)
|
||||
continue;
|
||||
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
|
||||
const vector<Prefilter*>& subs = *f->subs();
|
||||
const std::vector<Prefilter*>& subs = *f->subs();
|
||||
for (size_t j = 0; j < subs.size(); j++)
|
||||
v.push_back(subs[j]);
|
||||
}
|
||||
@ -187,16 +198,16 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
|
||||
// Identify unique nodes.
|
||||
int unique_id = 0;
|
||||
for (int i = v.size() - 1; i >= 0; i--) {
|
||||
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
|
||||
Prefilter *node = v[i];
|
||||
if (node == NULL)
|
||||
continue;
|
||||
node->set_unique_id(-1);
|
||||
Prefilter* canonical = CanonicalNode(node);
|
||||
Prefilter* canonical = CanonicalNode(nodes, node);
|
||||
if (canonical == NULL) {
|
||||
// Any further nodes that have the same node string
|
||||
// will find this node as the canonical node.
|
||||
node_map_[NodeString(node)] = node;
|
||||
nodes->emplace(NodeString(node), node);
|
||||
if (node->op() == Prefilter::ATOM) {
|
||||
atom_vec->push_back(node->atom());
|
||||
atom_index_to_id_.push_back(unique_id);
|
||||
@ -206,15 +217,15 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
node->set_unique_id(canonical->unique_id());
|
||||
}
|
||||
}
|
||||
entries_.resize(node_map_.size());
|
||||
entries_.resize(nodes->size());
|
||||
|
||||
// Create parent IntMap for the entries.
|
||||
for (int i = v.size() - 1; i >= 0; i--) {
|
||||
// Create parent StdIntMap for the entries.
|
||||
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
|
||||
Prefilter* prefilter = v[i];
|
||||
if (prefilter == NULL)
|
||||
continue;
|
||||
|
||||
if (CanonicalNode(prefilter) != prefilter)
|
||||
if (CanonicalNode(nodes, prefilter) != prefilter)
|
||||
continue;
|
||||
|
||||
Entry* entry = &entries_[prefilter->unique_id()];
|
||||
@ -222,12 +233,12 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
}
|
||||
|
||||
// Fill the entries.
|
||||
for (int i = v.size() - 1; i >= 0; i--) {
|
||||
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
|
||||
Prefilter* prefilter = v[i];
|
||||
if (prefilter == NULL)
|
||||
continue;
|
||||
|
||||
if (CanonicalNode(prefilter) != prefilter)
|
||||
if (CanonicalNode(nodes, prefilter) != prefilter)
|
||||
continue;
|
||||
|
||||
Entry* entry = &entries_[prefilter->unique_id()];
|
||||
@ -244,10 +255,10 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
|
||||
case Prefilter::OR:
|
||||
case Prefilter::AND: {
|
||||
set<int> uniq_child;
|
||||
for (size_t j = 0; j < prefilter->subs()->size() ; j++) {
|
||||
std::set<int> uniq_child;
|
||||
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
|
||||
Prefilter* child = (*prefilter->subs())[j];
|
||||
Prefilter* canonical = CanonicalNode(child);
|
||||
Prefilter* canonical = CanonicalNode(nodes, child);
|
||||
if (canonical == NULL) {
|
||||
LOG(DFATAL) << "Null canonical node";
|
||||
return;
|
||||
@ -256,11 +267,14 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
uniq_child.insert(child_id);
|
||||
// To the child, we want to add to parent indices.
|
||||
Entry* child_entry = &entries_[child_id];
|
||||
if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end())
|
||||
if (child_entry->parents->find(prefilter->unique_id()) ==
|
||||
child_entry->parents->end()) {
|
||||
(*child_entry->parents)[prefilter->unique_id()] = 1;
|
||||
}
|
||||
}
|
||||
entry->propagate_up_at_count =
|
||||
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
|
||||
entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
|
||||
? static_cast<int>(uniq_child.size())
|
||||
: 1;
|
||||
|
||||
break;
|
||||
}
|
||||
@ -271,29 +285,28 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
||||
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
|
||||
if (prefilter_vec_[i] == NULL)
|
||||
continue;
|
||||
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
|
||||
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
|
||||
DCHECK_LE(0, id);
|
||||
Entry* entry = &entries_[id];
|
||||
entry->regexps.push_back(i);
|
||||
entry->regexps.push_back(static_cast<int>(i));
|
||||
}
|
||||
}
|
||||
|
||||
// Functions for triggering during search.
|
||||
void PrefilterTree::RegexpsGivenStrings(
|
||||
const vector<int>& matched_atoms,
|
||||
vector<int>* regexps) const {
|
||||
const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* regexps) const {
|
||||
regexps->clear();
|
||||
if (!compiled_) {
|
||||
LOG(WARNING) << "Compile() not called";
|
||||
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
|
||||
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
|
||||
regexps->push_back(i);
|
||||
regexps->push_back(static_cast<int>(i));
|
||||
} else {
|
||||
if (!prefilter_vec_.empty()) {
|
||||
IntMap regexps_map(prefilter_vec_.size());
|
||||
vector<int> matched_atom_ids;
|
||||
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
|
||||
std::vector<int> matched_atom_ids;
|
||||
for (size_t j = 0; j < matched_atoms.size(); j++) {
|
||||
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
||||
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
|
||||
}
|
||||
PropagateMatch(matched_atom_ids, &regexps_map);
|
||||
for (IntMap::iterator it = regexps_map.begin();
|
||||
@ -304,23 +317,20 @@ void PrefilterTree::RegexpsGivenStrings(
|
||||
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
||||
}
|
||||
}
|
||||
sort(regexps->begin(), regexps->end());
|
||||
std::sort(regexps->begin(), regexps->end());
|
||||
}
|
||||
|
||||
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
||||
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
|
||||
IntMap* regexps) const {
|
||||
IntMap count(entries_.size());
|
||||
IntMap work(entries_.size());
|
||||
IntMap count(static_cast<int>(entries_.size()));
|
||||
IntMap work(static_cast<int>(entries_.size()));
|
||||
for (size_t i = 0; i < atom_ids.size(); i++)
|
||||
work.set(atom_ids[i], 1);
|
||||
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
||||
const Entry& entry = entries_[it->index()];
|
||||
VLOG(10) << "Processing: " << it->index();
|
||||
// Record regexps triggered.
|
||||
for (size_t i = 0; i < entry.regexps.size(); i++) {
|
||||
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
|
||||
for (size_t i = 0; i < entry.regexps.size(); i++)
|
||||
regexps->set(entry.regexps[i], 1);
|
||||
}
|
||||
int c;
|
||||
// Pass trigger up to parents.
|
||||
for (StdIntMap::iterator it = entry.parents->begin();
|
||||
@ -328,7 +338,6 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
||||
++it) {
|
||||
int j = it->first;
|
||||
const Entry& parent = entries_[j];
|
||||
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
|
||||
// Delay until all the children have succeeded.
|
||||
if (parent.propagate_up_at_count > 1) {
|
||||
if (count.has_index(j)) {
|
||||
@ -341,7 +350,6 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
||||
if (c < parent.propagate_up_at_count)
|
||||
continue;
|
||||
}
|
||||
VLOG(10) << "Triggering: " << j;
|
||||
// Trigger the parent.
|
||||
work.set(j, 1);
|
||||
}
|
||||
@ -350,26 +358,26 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
||||
|
||||
// Debugging help.
|
||||
void PrefilterTree::PrintPrefilter(int regexpid) {
|
||||
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
|
||||
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
|
||||
}
|
||||
|
||||
void PrefilterTree::PrintDebugInfo() {
|
||||
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
|
||||
VLOG(10) << "#Unique Nodes: " << entries_.size();
|
||||
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
|
||||
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
|
||||
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
|
||||
|
||||
for (size_t i = 0; i < entries_.size(); ++i) {
|
||||
StdIntMap* parents = entries_[i].parents;
|
||||
const vector<int>& regexps = entries_[i].regexps;
|
||||
VLOG(10) << "EntryId: " << i
|
||||
<< " N: " << parents->size() << " R: " << regexps.size();
|
||||
const std::vector<int>& regexps = entries_[i].regexps;
|
||||
LOG(ERROR) << "EntryId: " << i
|
||||
<< " N: " << parents->size() << " R: " << regexps.size();
|
||||
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
||||
VLOG(10) << it->first;
|
||||
LOG(ERROR) << it->first;
|
||||
}
|
||||
VLOG(10) << "Map:";
|
||||
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
|
||||
iter != node_map_.end(); ++iter)
|
||||
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
|
||||
<< " Str: " << (*iter).first;
|
||||
LOG(ERROR) << "Map:";
|
||||
for (std::map<string, Prefilter*>::const_iterator iter = nodes->begin();
|
||||
iter != nodes->end(); ++iter)
|
||||
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
|
||||
<< " Str: " << (*iter).first;
|
||||
}
|
||||
|
||||
string PrefilterTree::DebugNodeString(Prefilter* node) const {
|
||||
@ -382,10 +390,10 @@ string PrefilterTree::DebugNodeString(Prefilter* node) const {
|
||||
// Adding the operation disambiguates AND and OR nodes.
|
||||
node_string += node->op() == Prefilter::AND ? "AND" : "OR";
|
||||
node_string += "(";
|
||||
for (size_t i = 0; i < node->subs()->size() ; i++) {
|
||||
for (size_t i = 0; i < node->subs()->size(); i++) {
|
||||
if (i > 0)
|
||||
node_string += ',';
|
||||
node_string += Itoa((*node->subs())[i]->unique_id());
|
||||
node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
|
||||
node_string += ":";
|
||||
node_string += DebugNodeString((*node->subs())[i]);
|
||||
}
|
||||
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_PREFILTER_TREE_H_
|
||||
#define RE2_PREFILTER_TREE_H_
|
||||
|
||||
// The PrefilterTree class is used to form an AND-OR tree of strings
|
||||
// that would trigger each regexp. The 'prefilter' of each regexp is
|
||||
// added to PrefilterTree, and then PrefilterTree is used to find all
|
||||
@ -12,23 +15,21 @@
|
||||
// favorite engine. PrefilterTree provides a set of strings (called
|
||||
// atoms) that the user of this class should use to do the string
|
||||
// matching.
|
||||
//
|
||||
#ifndef RE2_PREFILTER_TREE_H_
|
||||
#define RE2_PREFILTER_TREE_H_
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_array.h"
|
||||
#include "re2/prefilter.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
typedef SparseArray<int> IntMap;
|
||||
typedef map<int,int> StdIntMap;
|
||||
|
||||
class Prefilter;
|
||||
|
||||
class PrefilterTree {
|
||||
public:
|
||||
PrefilterTree();
|
||||
explicit PrefilterTree(int min_atom_len);
|
||||
~PrefilterTree();
|
||||
|
||||
// Adds the prefilter for the next regexp. Note that we assume that
|
||||
@ -42,20 +43,24 @@ class PrefilterTree {
|
||||
// The caller should use the returned set of strings to do string matching.
|
||||
// Each time a string matches, the corresponding index then has to be
|
||||
// passed to RegexpsGivenStrings below.
|
||||
void Compile(vector<string>* atom_vec);
|
||||
void Compile(std::vector<string>* atom_vec);
|
||||
|
||||
// Given the indices of the atoms that matched, returns the indexes
|
||||
// of regexps that should be searched. The matched_atoms should
|
||||
// contain all the ids of string atoms that were found to match the
|
||||
// content. The caller can use any string match engine to perform
|
||||
// this function. This function is thread safe.
|
||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* regexps) const;
|
||||
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||
std::vector<int>* regexps) const;
|
||||
|
||||
// Print debug prefilter. Also prints unique ids associated with
|
||||
// nodes of the prefilter of the regexp.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
private:
|
||||
typedef SparseArray<int> IntMap;
|
||||
typedef std::map<int, int> StdIntMap;
|
||||
typedef std::map<string, Prefilter*> NodeMap;
|
||||
|
||||
// Each unique node has a corresponding Entry that helps in
|
||||
// passing the matching trigger information along the tree.
|
||||
@ -76,22 +81,24 @@ class PrefilterTree {
|
||||
|
||||
// When this node is ready to trigger the parent, what are the
|
||||
// regexps that are triggered.
|
||||
vector<int> regexps;
|
||||
std::vector<int> regexps;
|
||||
};
|
||||
|
||||
private:
|
||||
// Returns true if the prefilter node should be kept.
|
||||
bool KeepNode(Prefilter* node) const;
|
||||
|
||||
// This function assigns unique ids to various parts of the
|
||||
// prefilter, by looking at if these nodes are already in the
|
||||
// PrefilterTree.
|
||||
void AssignUniqueIds(vector<string>* atom_vec);
|
||||
void AssignUniqueIds(NodeMap* nodes, std::vector<string>* atom_vec);
|
||||
|
||||
// Given the matching atoms, find the regexps to be triggered.
|
||||
void PropagateMatch(const vector<int>& atom_ids,
|
||||
void PropagateMatch(const std::vector<int>& atom_ids,
|
||||
IntMap* regexps) const;
|
||||
|
||||
// Returns the prefilter node that has the same NodeString as this
|
||||
// node. For the canonical node, returns node.
|
||||
Prefilter* CanonicalNode(Prefilter* node);
|
||||
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
|
||||
|
||||
// A string that uniquely identifies the node. Assumes that the
|
||||
// children of node has already been assigned unique ids.
|
||||
@ -101,29 +108,30 @@ class PrefilterTree {
|
||||
string DebugNodeString(Prefilter* node) const;
|
||||
|
||||
// Used for debugging.
|
||||
void PrintDebugInfo();
|
||||
void PrintDebugInfo(NodeMap* nodes);
|
||||
|
||||
// These are all the nodes formed by Compile. Essentially, there is
|
||||
// one node for each unique atom and each unique AND/OR node.
|
||||
vector<Entry> entries_;
|
||||
|
||||
// Map node string to canonical Prefilter node.
|
||||
map<string, Prefilter*> node_map_;
|
||||
std::vector<Entry> entries_;
|
||||
|
||||
// indices of regexps that always pass through the filter (since we
|
||||
// found no required literals in these regexps).
|
||||
vector<int> unfiltered_;
|
||||
std::vector<int> unfiltered_;
|
||||
|
||||
// vector of Prefilter for all regexps.
|
||||
vector<Prefilter*> prefilter_vec_;
|
||||
std::vector<Prefilter*> prefilter_vec_;
|
||||
|
||||
// Atom index in returned strings to entry id mapping.
|
||||
vector<int> atom_index_to_id_;
|
||||
std::vector<int> atom_index_to_id_;
|
||||
|
||||
// Has the prefilter tree been compiled.
|
||||
bool compiled_;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
|
||||
// Strings less than this length are not stored as atoms.
|
||||
const int min_atom_len_;
|
||||
|
||||
PrefilterTree(const PrefilterTree&) = delete;
|
||||
PrefilterTree& operator=(const PrefilterTree&) = delete;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
@ -5,48 +5,57 @@
|
||||
// Compiled regular expression representation.
|
||||
// Tested by compile_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "re2/prog.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/strutil.h"
|
||||
#include "re2/bitmap256.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructors per Inst opcode
|
||||
|
||||
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
|
||||
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstAlt);
|
||||
out1_ = out1;
|
||||
}
|
||||
|
||||
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
|
||||
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstByteRange);
|
||||
lo_ = lo & 0xFF;
|
||||
hi_ = hi & 0xFF;
|
||||
foldcase_ = foldcase;
|
||||
foldcase_ = foldcase & 0xFF;
|
||||
}
|
||||
|
||||
void Prog::Inst::InitCapture(int cap, uint32 out) {
|
||||
void Prog::Inst::InitCapture(int cap, uint32_t out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstCapture);
|
||||
cap_ = cap;
|
||||
}
|
||||
|
||||
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
|
||||
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstEmptyWidth);
|
||||
empty_ = empty;
|
||||
}
|
||||
|
||||
void Prog::Inst::InitMatch(int32 id) {
|
||||
void Prog::Inst::InitMatch(int32_t id) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_opcode(kInstMatch);
|
||||
match_id_ = id;
|
||||
}
|
||||
|
||||
void Prog::Inst::InitNop(uint32 out) {
|
||||
void Prog::Inst::InitNop(uint32_t out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_opcode(kInstNop);
|
||||
}
|
||||
@ -94,34 +103,27 @@ Prog::Prog()
|
||||
: anchor_start_(false),
|
||||
anchor_end_(false),
|
||||
reversed_(false),
|
||||
did_flatten_(false),
|
||||
did_onepass_(false),
|
||||
start_(0),
|
||||
start_unanchored_(0),
|
||||
size_(0),
|
||||
byte_inst_count_(0),
|
||||
bytemap_range_(0),
|
||||
first_byte_(-1),
|
||||
flags_(0),
|
||||
onepass_statesize_(0),
|
||||
list_count_(0),
|
||||
inst_(NULL),
|
||||
dfa_first_(NULL),
|
||||
dfa_longest_(NULL),
|
||||
dfa_mem_(0),
|
||||
delete_dfa_(NULL),
|
||||
unbytemap_(NULL),
|
||||
onepass_nodes_(NULL),
|
||||
onepass_start_(NULL) {
|
||||
dfa_mem_(0),
|
||||
dfa_first_(NULL),
|
||||
dfa_longest_(NULL) {
|
||||
}
|
||||
|
||||
Prog::~Prog() {
|
||||
if (delete_dfa_) {
|
||||
if (dfa_first_)
|
||||
delete_dfa_(dfa_first_);
|
||||
if (dfa_longest_)
|
||||
delete_dfa_(dfa_longest_);
|
||||
}
|
||||
DeleteDFA(dfa_longest_);
|
||||
DeleteDFA(dfa_first_);
|
||||
delete[] onepass_nodes_;
|
||||
delete[] inst_;
|
||||
delete[] unbytemap_;
|
||||
}
|
||||
|
||||
typedef SparseSet Workq;
|
||||
@ -133,7 +135,6 @@ static inline void AddToQueue(Workq* q, int id) {
|
||||
|
||||
static string ProgToString(Prog* prog, Workq* q) {
|
||||
string s;
|
||||
|
||||
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
|
||||
int id = *i;
|
||||
Prog::Inst* ip = prog->inst(id);
|
||||
@ -145,29 +146,56 @@ static string ProgToString(Prog* prog, Workq* q) {
|
||||
return s;
|
||||
}
|
||||
|
||||
string Prog::Dump() {
|
||||
string map;
|
||||
if (false) { // Debugging
|
||||
int lo = 0;
|
||||
StringAppendF(&map, "byte map:\n");
|
||||
for (int i = 0; i < bytemap_range_; i++) {
|
||||
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
|
||||
lo = unbytemap_[i] + 1;
|
||||
}
|
||||
StringAppendF(&map, "\n");
|
||||
// Renders the instructions of prog from index `start` up to prog->size(),
// one per line. The final instruction of a list (ip->last()) is written as
// "<id>. <dump>"; every other instruction is written as "<id>+ <dump>".
static string FlattenedProgToString(Prog* prog, int start) {
  string out;
  int id = start;
  while (id < prog->size()) {
    Prog::Inst* cur = prog->inst(id);
    const string text = cur->Dump();
    if (!cur->last()) {
      StringAppendF(&out, "%d+ %s\n", id, text.c_str());
    } else {
      StringAppendF(&out, "%d. %s\n", id, text.c_str());
    }
    id++;
  }
  return out;
}
|
||||
|
||||
// Returns a textual listing of the program, beginning at start_.
//
// Once Flatten() has run (did_flatten_ is set), the listing comes from
// FlattenedProgToString(); otherwise a work queue is seeded with start_ and
// handed to ProgToString().
string Prog::Dump() {
  if (did_flatten_)
    return FlattenedProgToString(this, start_);

  Workq q(size_);
  AddToQueue(&q, start_);
  // The byte-map prefix that used to be prepended here is gone; the byte map
  // is reported separately by DumpByteMap().
  return ProgToString(this, &q);
}
|
||||
|
||||
// Returns a textual listing of the program, beginning at start_unanchored_.
// An unflattened program is listed via a work queue and ProgToString();
// a flattened one via FlattenedProgToString().
string Prog::DumpUnanchored() {
  if (!did_flatten_) {
    Workq pending(size_);
    AddToQueue(&pending, start_unanchored_);
    return ProgToString(this, &pending);
  }
  return FlattenedProgToString(this, start_unanchored_);
}
|
||||
|
||||
// Returns a textual description of bytemap_: one line per maximal run of
// consecutive byte values that share the same entry, formatted as
// "[lo-hi] -> value" with lo and hi in two-digit hexadecimal.
string Prog::DumpByteMap() {
  string out;
  int lo = 0;
  while (lo < 256) {
    const int cls = bytemap_[lo];
    // Extend the run while the following byte maps to the same value.
    int hi = lo;
    while (hi + 1 < 256 && bytemap_[hi + 1] == cls)
      hi++;
    StringAppendF(&out, "[%02x-%02x] -> %d\n", lo, hi, cls);
    lo = hi + 1;
  }
  return out;
}
|
||||
|
||||
int Prog::first_byte() {
|
||||
std::call_once(first_byte_once_, [](Prog* prog) {
|
||||
prog->first_byte_ = prog->ComputeFirstByte();
|
||||
}, this);
|
||||
return first_byte_;
|
||||
}
|
||||
|
||||
static bool IsMatch(Prog*, Prog::Inst*);
|
||||
|
||||
// Peep-hole optimizer.
|
||||
@ -260,7 +288,7 @@ static bool IsMatch(Prog* prog, Prog::Inst* ip) {
|
||||
}
|
||||
}
|
||||
|
||||
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
||||
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
||||
int flags = 0;
|
||||
|
||||
// ^ and \A
|
||||
@ -294,50 +322,505 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
||||
return flags;
|
||||
}
|
||||
|
||||
void Prog::MarkByteRange(int lo, int hi) {
|
||||
// ByteMapBuilder implements a coloring algorithm.
|
||||
//
|
||||
// The first phase is a series of "mark and merge" batches: we mark one or more
|
||||
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
|
||||
// performance; rather, it means that the ranges are treated indistinguishably.
|
||||
//
|
||||
// Internally, the ranges are represented using a bitmap that stores the splits
|
||||
// and a vector that stores the colors; both of them are indexed by the ranges'
|
||||
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
|
||||
// hi (if not already split), then recolor each range in between. The color map
|
||||
// (i.e. from the old color to the new color) is maintained for the lifetime of
|
||||
// the batch and so underpins this somewhat obscure approach to set operations.
|
||||
//
|
||||
// The second phase builds the bytemap from our internal state: we recolor each
|
||||
// range, then store the new color (which is now the byte class) in each of the
|
||||
// corresponding array elements. Finally, we output the number of byte classes.
|
||||
class ByteMapBuilder {
|
||||
public:
|
||||
ByteMapBuilder() {
|
||||
// Initial state: the [0-255] range has color 256.
|
||||
// This will avoid problems during the second phase,
|
||||
// in which we assign byte classes numbered from 0.
|
||||
splits_.Set(255);
|
||||
colors_.resize(256);
|
||||
colors_[255] = 256;
|
||||
nextcolor_ = 257;
|
||||
}
|
||||
|
||||
void Mark(int lo, int hi);
|
||||
void Merge();
|
||||
void Build(uint8_t* bytemap, int* bytemap_range);
|
||||
|
||||
private:
|
||||
int Recolor(int oldcolor);
|
||||
|
||||
Bitmap256 splits_;
|
||||
std::vector<int> colors_;
|
||||
int nextcolor_;
|
||||
std::vector<std::pair<int, int>> colormap_;
|
||||
std::vector<std::pair<int, int>> ranges_;
|
||||
|
||||
ByteMapBuilder(const ByteMapBuilder&) = delete;
|
||||
ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
|
||||
};
|
||||
|
||||
// Queues the byte range [lo-hi] to be merged by the next call to Merge().
// Debug builds check that 0 <= lo <= hi <= 255.
void ByteMapBuilder::Mark(int lo, int hi) {
  DCHECK_GE(lo, 0);
  DCHECK_GE(hi, 0);
  DCHECK_LE(lo, 255);
  DCHECK_LE(hi, 255);
  DCHECK_LE(lo, hi);

  // Ignore any [0-255] ranges. They cause us to recolor every range, which
  // has no effect on the eventual result and is therefore a waste of time.
  if (lo == 0 && hi == 255)
    return;

  ranges_.emplace_back(lo, hi);
}
|
||||
|
||||
void ByteMapBuilder::Merge() {
|
||||
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
|
||||
it != ranges_.end();
|
||||
++it) {
|
||||
int lo = it->first-1;
|
||||
int hi = it->second;
|
||||
|
||||
if (0 <= lo && !splits_.Test(lo)) {
|
||||
splits_.Set(lo);
|
||||
int next = splits_.FindNextSetBit(lo+1);
|
||||
colors_[lo] = colors_[next];
|
||||
}
|
||||
if (!splits_.Test(hi)) {
|
||||
splits_.Set(hi);
|
||||
int next = splits_.FindNextSetBit(hi+1);
|
||||
colors_[hi] = colors_[next];
|
||||
}
|
||||
|
||||
int c = lo+1;
|
||||
while (c < 256) {
|
||||
int next = splits_.FindNextSetBit(c);
|
||||
colors_[next] = Recolor(colors_[next]);
|
||||
if (next == hi)
|
||||
break;
|
||||
c = next+1;
|
||||
}
|
||||
}
|
||||
colormap_.clear();
|
||||
ranges_.clear();
|
||||
}
|
||||
|
||||
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
|
||||
// Assign byte classes numbered from 0.
|
||||
nextcolor_ = 0;
|
||||
|
||||
int c = 0;
|
||||
while (c < 256) {
|
||||
int next = splits_.FindNextSetBit(c);
|
||||
uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
|
||||
while (c <= next) {
|
||||
bytemap[c] = b;
|
||||
c++;
|
||||
}
|
||||
}
|
||||
|
||||
*bytemap_range = nextcolor_;
|
||||
}
|
||||
|
||||
// Returns the replacement color for oldcolor. If colormap_ has no entry
// mentioning oldcolor, a new color is taken from nextcolor_ and recorded.
int ByteMapBuilder::Recolor(int oldcolor) {
  // A linear scan is adequate: there can be at most 256 colors and there
  // will typically be far fewer. Both the key and the value of each entry
  // are compared so that a range already recolored in this batch is not
  // recolored a second time.
  for (const std::pair<int, int>& entry : colormap_) {
    if (entry.first == oldcolor || entry.second == oldcolor)
      return entry.second;
  }
  const int fresh = nextcolor_;
  nextcolor_++;
  colormap_.emplace_back(oldcolor, fresh);
  return fresh;
}
|
||||
|
||||
void Prog::ComputeByteMap() {
|
||||
// Fill in bytemap with byte classes for prog_.
|
||||
// Ranges of bytes that are treated as indistinguishable
|
||||
// by the regexp program are mapped to a single byte class.
|
||||
// The vector prog_->byterange() marks the end of each
|
||||
// such range.
|
||||
const Bitmap<256>& v = byterange();
|
||||
// Fill in bytemap with byte classes for the program.
|
||||
// Ranges of bytes that are treated indistinguishably
|
||||
// will be mapped to a single byte class.
|
||||
ByteMapBuilder builder;
|
||||
|
||||
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
|
||||
uint8 n = 0;
|
||||
uint32 bits = 0;
|
||||
for (int i = 0; i < 256; i++) {
|
||||
if ((i&31) == 0)
|
||||
bits = v.Word(i >> 5);
|
||||
bytemap_[i] = n;
|
||||
n += bits & 1;
|
||||
bits >>= 1;
|
||||
}
|
||||
bytemap_range_ = bytemap_[255] + 1;
|
||||
unbytemap_ = new uint8[bytemap_range_];
|
||||
for (int i = 0; i < 256; i++)
|
||||
unbytemap_[bytemap_[i]] = i;
|
||||
// Don't repeat the work for ^ and $.
|
||||
bool marked_line_boundaries = false;
|
||||
// Don't repeat the work for \b and \B.
|
||||
bool marked_word_boundaries = false;
|
||||
|
||||
if (0) { // For debugging: use trivial byte map.
|
||||
for (int i = 0; i < 256; i++) {
|
||||
bytemap_[i] = i;
|
||||
unbytemap_[i] = i;
|
||||
for (int id = 0; id < size(); id++) {
|
||||
Inst* ip = inst(id);
|
||||
if (ip->opcode() == kInstByteRange) {
|
||||
int lo = ip->lo();
|
||||
int hi = ip->hi();
|
||||
builder.Mark(lo, hi);
|
||||
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
|
||||
int foldlo = lo;
|
||||
int foldhi = hi;
|
||||
if (foldlo < 'a')
|
||||
foldlo = 'a';
|
||||
if (foldhi > 'z')
|
||||
foldhi = 'z';
|
||||
if (foldlo <= foldhi)
|
||||
builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
|
||||
}
|
||||
// If this Inst is not the last Inst in its list AND the next Inst is
|
||||
// also a ByteRange AND the Insts have the same out, defer the merge.
|
||||
if (!ip->last() &&
|
||||
inst(id+1)->opcode() == kInstByteRange &&
|
||||
ip->out() == inst(id+1)->out())
|
||||
continue;
|
||||
builder.Merge();
|
||||
} else if (ip->opcode() == kInstEmptyWidth) {
|
||||
if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
|
||||
!marked_line_boundaries) {
|
||||
builder.Mark('\n', '\n');
|
||||
builder.Merge();
|
||||
marked_line_boundaries = true;
|
||||
}
|
||||
if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
|
||||
!marked_word_boundaries) {
|
||||
// We require two batches here: the first for ranges that are word
|
||||
// characters, the second for ranges that are not word characters.
|
||||
for (bool isword : {true, false}) {
|
||||
int j;
|
||||
for (int i = 0; i < 256; i = j) {
|
||||
for (j = i + 1; j < 256 &&
|
||||
Prog::IsWordChar(static_cast<uint8_t>(i)) ==
|
||||
Prog::IsWordChar(static_cast<uint8_t>(j));
|
||||
j++)
|
||||
;
|
||||
if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
|
||||
builder.Mark(i, j - 1);
|
||||
}
|
||||
builder.Merge();
|
||||
}
|
||||
marked_word_boundaries = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
builder.Build(bytemap_, &bytemap_range_);
|
||||
|
||||
if (0) { // For debugging, use trivial bytemap.
|
||||
LOG(ERROR) << "Using trivial bytemap.";
|
||||
for (int i = 0; i < 256; i++)
|
||||
bytemap_[i] = static_cast<uint8_t>(i);
|
||||
bytemap_range_ = 256;
|
||||
LOG(INFO) << "Using trivial bytemap.";
|
||||
}
|
||||
}
|
||||
|
||||
// Prog::Flatten() implements a graph rewriting algorithm.
|
||||
//
|
||||
// The overall process is similar to epsilon removal, but retains some epsilon
|
||||
// transitions: those from Capture and EmptyWidth instructions; and those from
|
||||
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
|
||||
// in the worst case.) It might be best thought of as Alt instruction elision.
|
||||
//
|
||||
// In conceptual terms, it divides the Prog into "trees" of instructions, then
|
||||
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
|
||||
// is one or more instructions that grow from one "root" instruction to one or
|
||||
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
|
||||
// "root" is also the "leaf". In most cases, a "root" is the successor of some
|
||||
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
|
||||
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
|
||||
// EmptyWidth or Match instruction. However, this is insufficient for handling
|
||||
// nested nullable subexpressions correctly, so in some cases, a "root" is the
|
||||
// dominator of the instructions reachable from some "successor root" (i.e. it
|
||||
// has an unreachable predecessor) and is considered a "dominator root". Since
|
||||
// only Alt instructions can be "dominator roots" (other instructions would be
|
||||
// "leaves"), only Alt instructions are required to be marked as predecessors.
|
||||
//
|
||||
// Dividing the Prog into "trees" comprises two passes: marking the "successor
|
||||
// roots" and the predecessors; and marking the "dominator roots". Sorting the
|
||||
// "successor roots" by their bytecode offsets enables iteration in order from
|
||||
// greatest to least during the second pass; by working backwards in this case
|
||||
// and flooding the graph no further than "leaves" and already marked "roots",
|
||||
// it becomes possible to mark "dominator roots" without doing excessive work.
|
||||
//
|
||||
// Traversing the "trees" is just iterating over the "roots" in order of their
|
||||
// marking and flooding the graph no further than "leaves" and "roots". When a
|
||||
// "leaf" is reached, the instruction is copied with its successor remapped to
|
||||
// its "root" number. When a "root" is reached, a Nop instruction is generated
|
||||
// with its successor remapped similarly. As each "list" is produced, its last
|
||||
// instruction is marked as such. After all of the "lists" have been produced,
|
||||
// a pass over their instructions remaps their successors to bytecode offsets.
|
||||
void Prog::Flatten() {
|
||||
if (did_flatten_)
|
||||
return;
|
||||
did_flatten_ = true;
|
||||
|
||||
// Scratch structures. It's important that these are reused by functions
|
||||
// that we call in loops because they would thrash the heap otherwise.
|
||||
SparseSet reachable(size());
|
||||
std::vector<int> stk;
|
||||
stk.reserve(size());
|
||||
|
||||
// First pass: Marks "successor roots" and predecessors.
|
||||
// Builds the mapping from inst-ids to root-ids.
|
||||
SparseArray<int> rootmap(size());
|
||||
SparseArray<int> predmap(size());
|
||||
std::vector<std::vector<int>> predvec;
|
||||
MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
|
||||
|
||||
// Second pass: Marks "dominator roots".
|
||||
SparseArray<int> sorted(rootmap);
|
||||
std::sort(sorted.begin(), sorted.end(), sorted.less);
|
||||
for (SparseArray<int>::const_iterator i = sorted.end() - 1;
|
||||
i != sorted.begin();
|
||||
--i) {
|
||||
if (i->index() != start_unanchored() && i->index() != start())
|
||||
MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
|
||||
}
|
||||
|
||||
// Third pass: Emits "lists". Remaps outs to root-ids.
|
||||
// Builds the mapping from root-ids to flat-ids.
|
||||
std::vector<int> flatmap(rootmap.size());
|
||||
std::vector<Inst> flat;
|
||||
flat.reserve(size());
|
||||
for (SparseArray<int>::const_iterator i = rootmap.begin();
|
||||
i != rootmap.end();
|
||||
++i) {
|
||||
flatmap[i->value()] = static_cast<int>(flat.size());
|
||||
EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
|
||||
flat.back().set_last();
|
||||
}
|
||||
|
||||
list_count_ = static_cast<int>(flatmap.size());
|
||||
for (int i = 0; i < kNumInst; i++)
|
||||
inst_count_[i] = 0;
|
||||
|
||||
// Fourth pass: Remaps outs to flat-ids.
|
||||
// Counts instructions by opcode.
|
||||
for (int id = 0; id < static_cast<int>(flat.size()); id++) {
|
||||
Inst* ip = &flat[id];
|
||||
if (ip->opcode() != kInstAltMatch) // handled in EmitList()
|
||||
ip->set_out(flatmap[ip->out()]);
|
||||
inst_count_[ip->opcode()]++;
|
||||
}
|
||||
|
||||
int total = 0;
|
||||
for (int i = 0; i < kNumInst; i++)
|
||||
total += inst_count_[i];
|
||||
DCHECK_EQ(total, static_cast<int>(flat.size()));
|
||||
|
||||
// Remap start_unanchored and start.
|
||||
if (start_unanchored() == 0) {
|
||||
DCHECK_EQ(start(), 0);
|
||||
} else if (start_unanchored() == start()) {
|
||||
set_start_unanchored(flatmap[1]);
|
||||
set_start(flatmap[1]);
|
||||
} else {
|
||||
set_start_unanchored(flatmap[1]);
|
||||
set_start(flatmap[2]);
|
||||
}
|
||||
|
||||
// Finally, replace the old instructions with the new instructions.
|
||||
size_ = static_cast<int>(flat.size());
|
||||
delete[] inst_;
|
||||
inst_ = new Inst[size_];
|
||||
memmove(inst_, flat.data(), size_ * sizeof *inst_);
|
||||
}
|
||||
|
||||
// First pass of Flatten(). Walks the instructions reachable from
// start_unanchored() and
//  - records in rootmap instruction 0, start_unanchored(), start() and the
//    out of every ByteRange, Capture and EmptyWidth instruction, giving each
//    the next unused root-id (the current rootmap size);
//  - records in predmap/predvec, for each out of an Alt or AltMatch
//    instruction, the id of that instruction as a predecessor.
// reachable and stk are scratch space and are cleared before use.
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Mark the kInstFail instruction.
  rootmap->set_new(0, rootmap->size());

  // Mark the start_unanchored and start instructions.
  if (!rootmap->has_index(start_unanchored()))
    rootmap->set_new(start_unanchored(), rootmap->size());
  if (!rootmap->has_index(start()))
    rootmap->set_new(start(), rootmap->size());

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();

    // Keep following out() without going through the stack until we reach
    // an instruction that was already visited or that has no successor to
    // follow.
    bool follow = true;
    while (follow && !reachable->contains(id)) {
      reachable->insert_new(id);
      follow = false;

      Inst* ip = inst(id);
      switch (ip->opcode()) {
        case kInstAltMatch:
        case kInstAlt:
          // Mark this instruction as a predecessor of each out.
          for (int succ : {ip->out(), ip->out1()}) {
            if (!predmap->has_index(succ)) {
              predmap->set_new(succ, static_cast<int>(predvec->size()));
              predvec->emplace_back();
            }
            (*predvec)[predmap->get_existing(succ)].emplace_back(id);
          }
          // Defer out1(); continue with out() right away.
          stk->push_back(ip->out1());
          id = ip->out();
          follow = true;
          break;

        case kInstByteRange:
        case kInstCapture:
        case kInstEmptyWidth:
          // Mark the out of this instruction as a "root".
          if (!rootmap->has_index(ip->out()))
            rootmap->set_new(ip->out(), rootmap->size());
          id = ip->out();
          follow = true;
          break;

        case kInstNop:
          id = ip->out();
          follow = true;
          break;

        case kInstMatch:
        case kInstFail:
          break;

        default:
          LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
          break;
      }
    }
  }
}
|
||||
|
||||
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
|
||||
SparseArray<int>* predmap,
|
||||
std::vector<std::vector<int>>* predvec,
|
||||
SparseSet* reachable, std::vector<int>* stk) {
|
||||
reachable->clear();
|
||||
stk->clear();
|
||||
stk->push_back(root);
|
||||
while (!stk->empty()) {
|
||||
int id = stk->back();
|
||||
stk->pop_back();
|
||||
Loop:
|
||||
if (reachable->contains(id))
|
||||
continue;
|
||||
reachable->insert_new(id);
|
||||
|
||||
if (id != root && rootmap->has_index(id)) {
|
||||
// We reached another "tree" via epsilon transition.
|
||||
continue;
|
||||
}
|
||||
|
||||
Inst* ip = inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
case kInstAlt:
|
||||
stk->push_back(ip->out1());
|
||||
id = ip->out();
|
||||
goto Loop;
|
||||
|
||||
case kInstByteRange:
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
break;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
goto Loop;
|
||||
|
||||
case kInstMatch:
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (SparseSet::const_iterator i = reachable->begin();
|
||||
i != reachable->end();
|
||||
++i) {
|
||||
int id = *i;
|
||||
if (predmap->has_index(id)) {
|
||||
for (int pred : (*predvec)[predmap->get_existing(id)]) {
|
||||
if (!reachable->contains(pred)) {
|
||||
// id has a predecessor that cannot be reached from root!
|
||||
// Therefore, id must be a "root" too - mark it as such.
|
||||
if (!rootmap->has_index(id))
|
||||
rootmap->set_new(id, rootmap->size());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Prog::EmitList(int root, SparseArray<int>* rootmap,
|
||||
std::vector<Inst>* flat,
|
||||
SparseSet* reachable, std::vector<int>* stk) {
|
||||
reachable->clear();
|
||||
stk->clear();
|
||||
stk->push_back(root);
|
||||
while (!stk->empty()) {
|
||||
int id = stk->back();
|
||||
stk->pop_back();
|
||||
Loop:
|
||||
if (reachable->contains(id))
|
||||
continue;
|
||||
reachable->insert_new(id);
|
||||
|
||||
if (id != root && rootmap->has_index(id)) {
|
||||
// We reached another "tree" via epsilon transition. Emit a kInstNop
|
||||
// instruction so that the Prog does not become quadratically larger.
|
||||
flat->emplace_back();
|
||||
flat->back().set_opcode(kInstNop);
|
||||
flat->back().set_out(rootmap->get_existing(id));
|
||||
continue;
|
||||
}
|
||||
|
||||
Inst* ip = inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
flat->emplace_back();
|
||||
flat->back().set_opcode(kInstAltMatch);
|
||||
flat->back().set_out(static_cast<int>(flat->size()));
|
||||
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
|
||||
FALLTHROUGH_INTENDED;
|
||||
|
||||
case kInstAlt:
|
||||
stk->push_back(ip->out1());
|
||||
id = ip->out();
|
||||
goto Loop;
|
||||
|
||||
case kInstByteRange:
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
flat->emplace_back();
|
||||
memmove(&flat->back(), ip, sizeof *ip);
|
||||
flat->back().set_out(rootmap->get_existing(ip->out()));
|
||||
break;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
goto Loop;
|
||||
|
||||
case kInstMatch:
|
||||
case kInstFail:
|
||||
flat->emplace_back();
|
||||
memmove(&flat->back(), ip, sizeof *ip);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
|
@ -2,50 +2,27 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_PROG_H_
|
||||
#define RE2_PROG_H_
|
||||
|
||||
// Compiled representation of regular expressions.
|
||||
// See regexp.h for the Regexp class, which represents a regular
|
||||
// expression symbolically.
|
||||
|
||||
#ifndef RE2_PROG_H__
|
||||
#define RE2_PROG_H__
|
||||
#include <stdint.h>
|
||||
#include <functional>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/sparse_array.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Simple fixed-size bitmap.
|
||||
template<int Bits>
|
||||
class Bitmap {
|
||||
public:
|
||||
Bitmap() { Reset(); }
|
||||
int Size() { return Bits; }
|
||||
|
||||
void Reset() {
|
||||
for (int i = 0; i < Words; i++)
|
||||
w_[i] = 0;
|
||||
}
|
||||
bool Get(int k) const {
|
||||
return w_[k >> WordLog] & (1<<(k & 31));
|
||||
}
|
||||
void Set(int k) {
|
||||
w_[k >> WordLog] |= 1<<(k & 31);
|
||||
}
|
||||
void Clear(int k) {
|
||||
w_[k >> WordLog] &= ~(1<<(k & 31));
|
||||
}
|
||||
uint32 Word(int i) const {
|
||||
return w_[i];
|
||||
}
|
||||
|
||||
private:
|
||||
static const int WordLog = 5;
|
||||
static const int Words = (Bits+31)/32;
|
||||
uint32 w_[Words];
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
|
||||
};
|
||||
|
||||
|
||||
// Opcodes for Inst
|
||||
enum InstOp {
|
||||
kInstAlt = 0, // choose between out_ and out1_
|
||||
@ -56,6 +33,7 @@ enum InstOp {
|
||||
kInstMatch, // found a match!
|
||||
kInstNop, // no-op; occasionally unavoidable
|
||||
kInstFail, // never match; occasionally unavoidable
|
||||
kNumInst,
|
||||
};
|
||||
|
||||
// Bit flags for empty-width specials
|
||||
@ -69,10 +47,8 @@ enum EmptyOp {
|
||||
kEmptyAllFlags = (1<<6)-1,
|
||||
};
|
||||
|
||||
class Regexp;
|
||||
|
||||
class DFA;
|
||||
struct OneState;
|
||||
class Regexp;
|
||||
|
||||
// Compiled form of regexp program.
|
||||
class Prog {
|
||||
@ -83,31 +59,39 @@ class Prog {
|
||||
// Single instruction in regexp program.
|
||||
class Inst {
|
||||
public:
|
||||
Inst() : out_opcode_(0), out1_(0) { }
|
||||
Inst() : out_opcode_(0), out1_(0) {}
|
||||
|
||||
// Copyable.
|
||||
Inst(const Inst&) = default;
|
||||
Inst& operator=(const Inst&) = default;
|
||||
|
||||
// Constructors per opcode
|
||||
void InitAlt(uint32 out, uint32 out1);
|
||||
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
|
||||
void InitCapture(int cap, uint32 out);
|
||||
void InitEmptyWidth(EmptyOp empty, uint32 out);
|
||||
void InitAlt(uint32_t out, uint32_t out1);
|
||||
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
|
||||
void InitCapture(int cap, uint32_t out);
|
||||
void InitEmptyWidth(EmptyOp empty, uint32_t out);
|
||||
void InitMatch(int id);
|
||||
void InitNop(uint32 out);
|
||||
void InitNop(uint32_t out);
|
||||
void InitFail();
|
||||
|
||||
// Getters
|
||||
int id(Prog* p) { return this - p->inst_; }
|
||||
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
|
||||
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
||||
int out() { return out_opcode_>>3; }
|
||||
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
||||
int last() { return (out_opcode_>>3)&1; }
|
||||
int out() { return out_opcode_>>4; }
|
||||
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
||||
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
||||
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
||||
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
|
||||
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
||||
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
||||
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
||||
bool greedy(Prog *p) {
|
||||
|
||||
bool greedy(Prog* p) {
|
||||
DCHECK_EQ(opcode(), kInstAltMatch);
|
||||
return p->inst(out())->opcode() == kInstByteRange;
|
||||
return p->inst(out())->opcode() == kInstByteRange ||
|
||||
(p->inst(out())->opcode() == kInstNop &&
|
||||
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
|
||||
}
|
||||
|
||||
// Does this inst (a kInstByteRange) match c?
|
||||
@ -122,52 +106,54 @@ class Prog {
|
||||
string Dump();
|
||||
|
||||
// Maximum instruction id.
|
||||
// (Must fit in out_opcode_, and PatchList steals another bit.)
|
||||
// (Must fit in out_opcode_. PatchList/last steal another bit.)
|
||||
static const int kMaxInst = (1<<28) - 1;
|
||||
|
||||
private:
|
||||
void set_opcode(InstOp opcode) {
|
||||
out_opcode_ = (out()<<3) | opcode;
|
||||
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
|
||||
}
|
||||
|
||||
void set_last() {
|
||||
out_opcode_ = (out()<<4) | (1<<3) | opcode();
|
||||
}
|
||||
|
||||
void set_out(int out) {
|
||||
out_opcode_ = (out<<3) | opcode();
|
||||
out_opcode_ = (out<<4) | (last()<<3) | opcode();
|
||||
}
|
||||
|
||||
void set_out_opcode(int out, InstOp opcode) {
|
||||
out_opcode_ = (out<<3) | opcode;
|
||||
out_opcode_ = (out<<4) | (last()<<3) | opcode;
|
||||
}
|
||||
|
||||
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
|
||||
union { // additional instruction arguments:
|
||||
uint32 out1_; // opcode == kInstAlt
|
||||
// alternate next instruction
|
||||
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
|
||||
union { // additional instruction arguments:
|
||||
uint32_t out1_; // opcode == kInstAlt
|
||||
// alternate next instruction
|
||||
|
||||
int32 cap_; // opcode == kInstCapture
|
||||
// Index of capture register (holds text
|
||||
// position recorded by capturing parentheses).
|
||||
// For \n (the submatch for the nth parentheses),
|
||||
// the left parenthesis captures into register 2*n
|
||||
// and the right one captures into register 2*n+1.
|
||||
int32_t cap_; // opcode == kInstCapture
|
||||
// Index of capture register (holds text
|
||||
// position recorded by capturing parentheses).
|
||||
// For \n (the submatch for the nth parentheses),
|
||||
// the left parenthesis captures into register 2*n
|
||||
// and the right one captures into register 2*n+1.
|
||||
|
||||
int32 match_id_; // opcode == kInstMatch
|
||||
// Match ID to identify this match (for re2::Set).
|
||||
int32_t match_id_; // opcode == kInstMatch
|
||||
// Match ID to identify this match (for re2::Set).
|
||||
|
||||
struct { // opcode == kInstByteRange
|
||||
uint8 lo_; // byte range is lo_-hi_ inclusive
|
||||
uint8 hi_; //
|
||||
uint8 foldcase_; // convert A-Z to a-z before checking range.
|
||||
struct { // opcode == kInstByteRange
|
||||
uint8_t lo_; // byte range is lo_-hi_ inclusive
|
||||
uint8_t hi_; //
|
||||
uint8_t foldcase_; // convert A-Z to a-z before checking range.
|
||||
};
|
||||
|
||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
||||
// empty_ is bitwise OR of kEmpty* flags above.
|
||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
||||
// empty_ is bitwise OR of kEmpty* flags above.
|
||||
};
|
||||
|
||||
friend class Compiler;
|
||||
friend struct PatchList;
|
||||
friend class Prog;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Inst);
|
||||
};
|
||||
|
||||
// Whether to anchor the search.
|
||||
@ -200,13 +186,13 @@ class Prog {
|
||||
int start_unanchored() { return start_unanchored_; }
|
||||
void set_start(int start) { start_ = start; }
|
||||
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
||||
int64 size() { return size_; }
|
||||
int size() { return size_; }
|
||||
bool reversed() { return reversed_; }
|
||||
void set_reversed(bool reversed) { reversed_ = reversed; }
|
||||
int64 byte_inst_count() { return byte_inst_count_; }
|
||||
const Bitmap<256>& byterange() { return byterange_; }
|
||||
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
|
||||
int64 dfa_mem() { return dfa_mem_; }
|
||||
int list_count() { return list_count_; }
|
||||
int inst_count(InstOp op) { return inst_count_[op]; }
|
||||
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
|
||||
int64_t dfa_mem() { return dfa_mem_; }
|
||||
int flags() { return flags_; }
|
||||
void set_flags(int flags) { flags_ = flags; }
|
||||
bool anchor_start() { return anchor_start_; }
|
||||
@ -214,21 +200,19 @@ class Prog {
|
||||
bool anchor_end() { return anchor_end_; }
|
||||
void set_anchor_end(bool b) { anchor_end_ = b; }
|
||||
int bytemap_range() { return bytemap_range_; }
|
||||
const uint8* bytemap() { return bytemap_; }
|
||||
const uint8_t* bytemap() { return bytemap_; }
|
||||
|
||||
// Lazily computed.
|
||||
int first_byte();
|
||||
|
||||
// Returns string representation of program for debugging.
|
||||
string Dump();
|
||||
string DumpUnanchored();
|
||||
|
||||
// Record that at some point in the prog, the bytes in the range
|
||||
// lo-hi (inclusive) are treated as different from bytes outside the range.
|
||||
// Tracking this lets the DFA collapse commonly-treated byte ranges
|
||||
// when recording state pointers, greatly reducing its memory footprint.
|
||||
void MarkByteRange(int lo, int hi);
|
||||
string DumpByteMap();
|
||||
|
||||
// Returns the set of kEmpty flags that are in effect at
|
||||
// position p within context.
|
||||
static uint32 EmptyFlags(const StringPiece& context, const char* p);
|
||||
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
|
||||
|
||||
// Returns whether byte c is a word character: ASCII only.
|
||||
// Used by the implementation of \b and \B.
|
||||
@ -237,7 +221,7 @@ class Prog {
|
||||
// (the DFA has only one-byte lookahead).
|
||||
// - even if the lookahead were possible, the Progs would be huge.
|
||||
// This crude approximation is the same one PCRE uses.
|
||||
static bool IsWordChar(uint8 c) {
|
||||
static bool IsWordChar(uint8_t c) {
|
||||
return ('A' <= c && c <= 'Z') ||
|
||||
('a' <= c && c <= 'z') ||
|
||||
('0' <= c && c <= '9') ||
|
||||
@ -270,19 +254,37 @@ class Prog {
|
||||
// If matches != NULL and kind == kManyMatch and there is a match,
|
||||
// SearchDFA fills matches with the match IDs of the final matching state.
|
||||
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match0, bool* failed,
|
||||
vector<int>* matches);
|
||||
Anchor anchor, MatchKind kind, StringPiece* match0,
|
||||
bool* failed, SparseSet* matches);
|
||||
|
||||
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
|
||||
// The callback issued after building each DFA state with BuildEntireDFA().
|
||||
// If next is null, then the memory budget has been exhausted and building
|
||||
// will halt. Otherwise, the state has been built and next points to an array
|
||||
// of bytemap_range()+1 slots holding the next states as per the bytemap and
|
||||
// kByteEndText. The number of the state is implied by the callback sequence:
|
||||
// the first callback is for state 0, the second callback is for state 1, ...
|
||||
// match indicates whether the state is a matching state.
|
||||
using DFAStateCallback = std::function<void(const int* next, bool match)>;
|
||||
|
||||
// Build the entire DFA for the given match kind.
|
||||
// Usually the DFA is built out incrementally, as needed, which
|
||||
// avoids lots of unnecessary work. This function is useful only
|
||||
// for testing purposes. Returns number of states.
|
||||
int BuildEntireDFA(MatchKind kind);
|
||||
// avoids lots of unnecessary work.
|
||||
// If cb is not empty, it receives one callback per state built.
|
||||
// Returns the number of states built.
|
||||
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
|
||||
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
|
||||
|
||||
// Compute byte map.
|
||||
// Controls whether the DFA should bail out early if the NFA would be faster.
|
||||
// FOR TESTING ONLY.
|
||||
static void TEST_dfa_should_bail_when_slow(bool b);
|
||||
|
||||
// Compute bytemap.
|
||||
void ComputeByteMap();
|
||||
|
||||
// Computes whether all matches must begin with the same first
|
||||
// byte, and if so, returns that byte. If not, returns -1.
|
||||
int ComputeFirstByte();
|
||||
|
||||
// Run peep-hole optimizer on program.
|
||||
void Optimize();
|
||||
|
||||
@ -329,48 +331,80 @@ class Prog {
|
||||
// Returns true on success, false on error.
|
||||
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
||||
|
||||
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
||||
// Outputs the program fanout into the given sparse array.
|
||||
void Fanout(SparseArray<int>* fanout);
|
||||
|
||||
// Compiles a collection of regexps to Prog. Each regexp will have
|
||||
// its own Match instruction recording the index in the vector.
|
||||
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
Regexp* re);
|
||||
// its own Match instruction recording the index in the output vector.
|
||||
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
|
||||
|
||||
// Flattens the Prog from "tree" form to "list" form. This is an in-place
|
||||
// operation in the sense that the old instructions are lost.
|
||||
void Flatten();
|
||||
|
||||
// Walks the Prog; the "successor roots" or predecessors of the reachable
|
||||
// instructions are marked in rootmap or predmap/predvec, respectively.
|
||||
// reachable and stk are preallocated scratch structures.
|
||||
void MarkSuccessors(SparseArray<int>* rootmap,
|
||||
SparseArray<int>* predmap,
|
||||
std::vector<std::vector<int>>* predvec,
|
||||
SparseSet* reachable, std::vector<int>* stk);
|
||||
|
||||
// Walks the Prog from the given "root" instruction; the "dominator root"
|
||||
// of the reachable instructions (if such exists) is marked in rootmap.
|
||||
// reachable and stk are preallocated scratch structures.
|
||||
void MarkDominator(int root, SparseArray<int>* rootmap,
|
||||
SparseArray<int>* predmap,
|
||||
std::vector<std::vector<int>>* predvec,
|
||||
SparseSet* reachable, std::vector<int>* stk);
|
||||
|
||||
// Walks the Prog from the given "root" instruction; the reachable
|
||||
// instructions are emitted in "list" form and appended to flat.
|
||||
// reachable and stk are preallocated scratch structures.
|
||||
void EmitList(int root, SparseArray<int>* rootmap,
|
||||
std::vector<Inst>* flat,
|
||||
SparseSet* reachable, std::vector<int>* stk);
|
||||
|
||||
private:
|
||||
friend class Compiler;
|
||||
|
||||
DFA* GetDFA(MatchKind kind);
|
||||
void DeleteDFA(DFA* dfa);
|
||||
|
||||
bool anchor_start_; // regexp has explicit start anchor
|
||||
bool anchor_end_; // regexp has explicit end anchor
|
||||
bool reversed_; // whether program runs backward over input
|
||||
bool did_flatten_; // has Flatten been called?
|
||||
bool did_onepass_; // has IsOnePass been called?
|
||||
|
||||
int start_; // entry point for program
|
||||
int start_unanchored_; // unanchored entry point for program
|
||||
int size_; // number of instructions
|
||||
int byte_inst_count_; // number of kInstByteRange instructions
|
||||
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
||||
int first_byte_; // required first byte for match, or -1 if none
|
||||
int flags_; // regexp parse flags
|
||||
int onepass_statesize_; // byte size of each OneState* node
|
||||
|
||||
int list_count_; // count of lists (see above)
|
||||
int inst_count_[kNumInst]; // count of instructions by opcode
|
||||
|
||||
Inst* inst_; // pointer to instruction array
|
||||
uint8_t* onepass_nodes_; // data for OnePass nodes
|
||||
|
||||
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
|
||||
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
|
||||
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
|
||||
int64 dfa_mem_; // Maximum memory for DFAs.
|
||||
void (*delete_dfa_)(DFA* dfa);
|
||||
int64_t dfa_mem_; // Maximum memory for DFAs.
|
||||
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
|
||||
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
|
||||
|
||||
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
|
||||
// commonly-treated byte range.
|
||||
uint8 bytemap_[256]; // map from input bytes to byte classes
|
||||
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
|
||||
uint8_t bytemap_[256]; // map from input bytes to byte classes
|
||||
|
||||
uint8* onepass_nodes_; // data for OnePass nodes
|
||||
OneState* onepass_start_; // start node for OnePass program
|
||||
std::once_flag first_byte_once_;
|
||||
std::once_flag dfa_first_once_;
|
||||
std::once_flag dfa_longest_once_;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Prog);
|
||||
Prog(const Prog&) = delete;
|
||||
Prog& operator=(const Prog&) = delete;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PROG_H__
|
||||
#endif // RE2_PROG_H_
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,8 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_RE2_H
|
||||
#define RE2_RE2_H
|
||||
#ifndef RE2_RE2_H_
|
||||
#define RE2_RE2_H_
|
||||
|
||||
// C++ interface to the re2 regular-expression library.
|
||||
// RE2 supports Perl-style regular expressions (with extensions like
|
||||
@ -17,7 +17,7 @@
|
||||
// some of the more complicated things thrown away. In particular,
|
||||
// backreferences and generalized assertions are not available, nor is \Z.
|
||||
//
|
||||
// See http://code.google.com/p/re2/wiki/Syntax for the syntax
|
||||
// See https://github.com/google/re2/wiki/Syntax for the syntax
|
||||
// supported by RE2, and a comparison with PCRE and PERL regexps.
|
||||
//
|
||||
// For those not familiar with Perl's regular expressions,
|
||||
@ -179,38 +179,24 @@
|
||||
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
|
||||
// will leave 64 in a, b, c, and d.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/variadic_function.h"
|
||||
|
||||
#ifndef RE2_HAVE_LONGLONG
|
||||
#define RE2_HAVE_LONGLONG 1
|
||||
#endif
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
class Prog;
|
||||
class Regexp;
|
||||
} // namespace re2
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// TODO(junyer): Get rid of this.
|
||||
using std::string;
|
||||
using std::map;
|
||||
class Mutex;
|
||||
class Prog;
|
||||
class Regexp;
|
||||
|
||||
// The following enum should be used only as a constructor argument to indicate
|
||||
// that the variable has static storage class, and that the constructor should
|
||||
// do nothing to its state. It indicates to the reader that it is legal to
|
||||
// declare a static instance of the class, provided the constructor is given
|
||||
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
|
||||
// static variable that has a constructor or a destructor because invocation
|
||||
// order is undefined. However, IF the type can be initialized by filling with
|
||||
// zeroes (which the loader does for static variables), AND the type's
|
||||
// destructor does nothing to the storage, then a constructor for static
|
||||
// initialization can be declared as
|
||||
// explicit MyClass(LinkerInitialized x) {}
|
||||
// and invoked as
|
||||
// static MyClass my_variable_name(LINKER_INITIALIZED);
|
||||
enum LinkerInitialized { LINKER_INITIALIZED };
|
||||
|
||||
// Interface for regular expression matching. Also corresponds to a
|
||||
// pre-compiled regular expression. An "RE2" object is safe for
|
||||
@ -266,7 +252,7 @@ class RE2 {
|
||||
RE2(const string& pattern);
|
||||
#endif
|
||||
RE2(const StringPiece& pattern);
|
||||
RE2(const StringPiece& pattern, const Options& option);
|
||||
RE2(const StringPiece& pattern, const Options& options);
|
||||
~RE2();
|
||||
|
||||
// Returns whether RE2 was created properly.
|
||||
@ -293,6 +279,11 @@ class RE2 {
|
||||
// Larger numbers are more expensive than smaller numbers.
|
||||
int ProgramSize() const;
|
||||
|
||||
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
||||
// Outputs the program fanout as a histogram bucketed by powers of 2.
|
||||
// Returns the number of the largest non-empty bucket.
|
||||
int ProgramFanout(std::map<int, int>* histogram) const;
|
||||
|
||||
// Returns the underlying Regexp; not for general use.
|
||||
// Returns entire_regexp_ so that callers don't need
|
||||
// to know about prefix_ and prefix_foldcase_.
|
||||
@ -300,21 +291,21 @@ class RE2 {
|
||||
|
||||
/***** The useful part: the matching interface *****/
|
||||
|
||||
// Matches "text" against "pattern". If pointer arguments are
|
||||
// Matches "text" against "re". If pointer arguments are
|
||||
// supplied, copies matched sub-patterns into them.
|
||||
//
|
||||
// You can pass in a "const char*" or a "string" for "text".
|
||||
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
|
||||
// You can pass in a "const char*" or a "string" or an "RE2" for "re".
|
||||
//
|
||||
// The provided pointer arguments can be pointers to any scalar numeric
|
||||
// type, or one of:
|
||||
// string (matched piece is copied to string)
|
||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
||||
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
|
||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
||||
//
|
||||
// Returns true iff all of the following conditions are satisfied:
|
||||
// a. "text" matches "pattern" exactly
|
||||
// a. "text" matches "re" exactly
|
||||
// b. The number of matched sub-patterns is >= number of supplied pointers
|
||||
// c. The "i"th argument has a suitable type for holding the
|
||||
// string captured as the "i"th sub-pattern. If you pass in
|
||||
@ -330,32 +321,65 @@ class RE2 {
|
||||
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
||||
static bool FullMatchN(const StringPiece& text, const RE2& re,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
|
||||
|
||||
// Exactly like FullMatch(), except that "pattern" is allowed to match
|
||||
// Exactly like FullMatch(), except that "re" is allowed to match
|
||||
// a substring of "text".
|
||||
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
|
||||
static bool PartialMatchN(const StringPiece& text, const RE2& re,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
|
||||
|
||||
// Like FullMatch() and PartialMatch(), except that pattern has to
|
||||
// match a prefix of "text", and "input" is advanced past the matched
|
||||
// Like FullMatch() and PartialMatch(), except that "re" has to match
|
||||
// a prefix of the text, and "input" is advanced past the matched
|
||||
// text. Note: "input" is modified iff this routine returns true.
|
||||
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
|
||||
static bool ConsumeN(StringPiece* input, const RE2& re,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
|
||||
|
||||
// Like Consume(..), but does not anchor the match at the beginning of the
|
||||
// string. That is, "pattern" need not start its match at the beginning of
|
||||
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
|
||||
// word in "s" and stores it in "word".
|
||||
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
|
||||
// Like Consume(), but does not anchor the match at the beginning of
|
||||
// the text. That is, "re" need not start its match at the beginning
|
||||
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
|
||||
// the next word in "s" and stores it in "word".
|
||||
static bool FindAndConsumeN(StringPiece* input, const RE2& re,
|
||||
const Arg* const args[], int argc);
|
||||
|
||||
#ifndef SWIG
|
||||
private:
|
||||
template <typename F, typename SP>
|
||||
static inline bool Apply(F f, SP sp, const RE2& re) {
|
||||
return f(sp, re, NULL, 0);
|
||||
}
|
||||
|
||||
template <typename F, typename SP, typename... A>
|
||||
static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
|
||||
const Arg* const args[] = {&a...};
|
||||
const int argc = sizeof...(a);
|
||||
return f(sp, re, args, argc);
|
||||
}
|
||||
|
||||
public:
|
||||
// In order to allow FullMatch() et al. to be called with a varying number
|
||||
// of arguments of varying types, we use two layers of variadic templates.
|
||||
// The first layer constructs the temporary Arg objects. The second layer
|
||||
// (above) constructs the array of pointers to the temporary Arg objects.
|
||||
|
||||
template <typename... A>
|
||||
static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
|
||||
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
|
||||
}
|
||||
|
||||
template <typename... A>
|
||||
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
|
||||
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
|
||||
}
|
||||
|
||||
template <typename... A>
|
||||
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
|
||||
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
|
||||
}
|
||||
|
||||
template <typename... A>
|
||||
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
|
||||
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Replace the first match of "pattern" in "str" with "rewrite".
|
||||
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
||||
@ -397,6 +421,8 @@ class RE2 {
|
||||
//
|
||||
// Returns true iff a match occurred and the extraction happened
|
||||
// successfully; if no match occurs, the string is left unaffected.
|
||||
//
|
||||
// REQUIRES: "text" must not alias any part of "*out".
|
||||
static bool Extract(const StringPiece &text,
|
||||
const RE2& pattern,
|
||||
const StringPiece &rewrite,
|
||||
@ -440,17 +466,16 @@ class RE2 {
|
||||
// does not count: if the regexp is "(a)(b)", returns 2.
|
||||
int NumberOfCapturingGroups() const;
|
||||
|
||||
|
||||
// Return a map from names to capturing indices.
|
||||
// The map records the index of the leftmost group
|
||||
// with the given name.
|
||||
// Only valid until the re is deleted.
|
||||
const map<string, int>& NamedCapturingGroups() const;
|
||||
const std::map<string, int>& NamedCapturingGroups() const;
|
||||
|
||||
// Return a map from capturing indices to names.
|
||||
// The map has no entries for unnamed groups.
|
||||
// Only valid until the re is deleted.
|
||||
const map<int, string>& CapturingGroupNames() const;
|
||||
const std::map<int, string>& CapturingGroupNames() const;
|
||||
|
||||
// General matching routine.
|
||||
// Match against text starting at offset startpos
|
||||
@ -459,8 +484,8 @@ class RE2 {
|
||||
// On a successful match, fills in match[] (up to nmatch entries)
|
||||
// with information about submatches.
|
||||
// E.g., matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
|
||||
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
|
||||
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
|
||||
// setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar",
|
||||
// match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL.
|
||||
//
|
||||
// Don't ask for more match information than you will use:
|
||||
// runs much faster with nmatch == 1 than nmatch > 1, and
|
||||
@ -471,10 +496,10 @@ class RE2 {
|
||||
// Passing text == StringPiece(NULL, 0) will be handled like any other
|
||||
// empty string, but note that on return, it will not be possible to tell
|
||||
// whether submatch i matched the empty string or did not match:
|
||||
// either way, match[i] == NULL.
|
||||
// either way, match[i].data() == NULL.
|
||||
bool Match(const StringPiece& text,
|
||||
int startpos,
|
||||
int endpos,
|
||||
size_t startpos,
|
||||
size_t endpos,
|
||||
Anchor anchor,
|
||||
StringPiece *match,
|
||||
int nmatch) const;
|
||||
@ -498,8 +523,8 @@ class RE2 {
|
||||
// Returns true on success. This method can fail because of a malformed
|
||||
// rewrite string. CheckRewriteString guarantees that the rewrite will
|
||||
// be successful.
|
||||
bool Rewrite(string *out,
|
||||
const StringPiece &rewrite,
|
||||
bool Rewrite(string* out,
|
||||
const StringPiece& rewrite,
|
||||
const StringPiece* vec,
|
||||
int veclen) const;
|
||||
|
||||
@ -632,19 +657,7 @@ class RE2 {
|
||||
void set_one_line(bool b) { one_line_ = b; }
|
||||
|
||||
void Copy(const Options& src) {
|
||||
encoding_ = src.encoding_;
|
||||
posix_syntax_ = src.posix_syntax_;
|
||||
longest_match_ = src.longest_match_;
|
||||
log_errors_ = src.log_errors_;
|
||||
max_mem_ = src.max_mem_;
|
||||
literal_ = src.literal_;
|
||||
never_nl_ = src.never_nl_;
|
||||
dot_nl_ = src.dot_nl_;
|
||||
never_capture_ = src.never_capture_;
|
||||
case_sensitive_ = src.case_sensitive_;
|
||||
perl_classes_ = src.perl_classes_;
|
||||
word_boundary_ = src.word_boundary_;
|
||||
one_line_ = src.one_line_;
|
||||
*this = src;
|
||||
}
|
||||
|
||||
int ParseFlags() const;
|
||||
@ -663,10 +676,6 @@ class RE2 {
|
||||
bool perl_classes_;
|
||||
bool word_boundary_;
|
||||
bool one_line_;
|
||||
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(Options);
|
||||
Options(const Options&);
|
||||
void operator=(const Options&);
|
||||
};
|
||||
|
||||
// Returns the options set in the constructor.
|
||||
@ -679,10 +688,8 @@ class RE2 {
|
||||
static inline Arg CRadix(unsigned int* x);
|
||||
static inline Arg CRadix(long* x);
|
||||
static inline Arg CRadix(unsigned long* x);
|
||||
#ifdef RE2_HAVE_LONGLONG
|
||||
static inline Arg CRadix(long long* x);
|
||||
static inline Arg CRadix(unsigned long long* x);
|
||||
#endif
|
||||
|
||||
static inline Arg Hex(short* x);
|
||||
static inline Arg Hex(unsigned short* x);
|
||||
@ -690,10 +697,8 @@ class RE2 {
|
||||
static inline Arg Hex(unsigned int* x);
|
||||
static inline Arg Hex(long* x);
|
||||
static inline Arg Hex(unsigned long* x);
|
||||
#ifdef RE2_HAVE_LONGLONG
|
||||
static inline Arg Hex(long long* x);
|
||||
static inline Arg Hex(unsigned long long* x);
|
||||
#endif
|
||||
|
||||
static inline Arg Octal(short* x);
|
||||
static inline Arg Octal(unsigned short* x);
|
||||
@ -701,47 +706,50 @@ class RE2 {
|
||||
static inline Arg Octal(unsigned int* x);
|
||||
static inline Arg Octal(long* x);
|
||||
static inline Arg Octal(unsigned long* x);
|
||||
#ifdef RE2_HAVE_LONGLONG
|
||||
static inline Arg Octal(long long* x);
|
||||
static inline Arg Octal(unsigned long long* x);
|
||||
#endif
|
||||
|
||||
private:
|
||||
void Init(const StringPiece& pattern, const Options& options);
|
||||
|
||||
bool DoMatch(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
const Arg* const args[],
|
||||
int n) const;
|
||||
Anchor anchor,
|
||||
size_t* consumed,
|
||||
const Arg* const args[],
|
||||
int n) const;
|
||||
|
||||
re2::Prog* ReverseProg() const;
|
||||
|
||||
mutable Mutex* mutex_;
|
||||
string pattern_; // string regular expression
|
||||
Options options_; // option flags
|
||||
string pattern_; // string regular expression
|
||||
Options options_; // option flags
|
||||
string prefix_; // required prefix (before regexp_)
|
||||
bool prefix_foldcase_; // prefix is ASCII case-insensitive
|
||||
re2::Regexp* entire_regexp_; // parsed regular expression
|
||||
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
|
||||
re2::Prog* prog_; // compiled program for regexp
|
||||
mutable re2::Prog* rprog_; // reverse program for regexp
|
||||
bool is_one_pass_; // can use prog_->SearchOnePass?
|
||||
mutable const string* error_; // Error indicator
|
||||
// (or points to empty string)
|
||||
mutable ErrorCode error_code_; // Error code
|
||||
mutable string error_arg_; // Fragment of regexp showing error
|
||||
mutable int num_captures_; // Number of capturing groups
|
||||
bool is_one_pass_; // can use prog_->SearchOnePass?
|
||||
|
||||
mutable re2::Prog* rprog_; // reverse program for regexp
|
||||
mutable const string* error_; // Error indicator
|
||||
// (or points to empty string)
|
||||
mutable ErrorCode error_code_; // Error code
|
||||
mutable string error_arg_; // Fragment of regexp showing error
|
||||
mutable int num_captures_; // Number of capturing groups
|
||||
|
||||
// Map from capture names to indices
|
||||
mutable const map<string, int>* named_groups_;
|
||||
mutable const std::map<string, int>* named_groups_;
|
||||
|
||||
// Map from capture indices to names
|
||||
mutable const map<int, string>* group_names_;
|
||||
mutable const std::map<int, string>* group_names_;
|
||||
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
|
||||
RE2(const RE2&);
|
||||
void operator=(const RE2&);
|
||||
// Onces for lazy computations.
|
||||
mutable std::once_flag rprog_once_;
|
||||
mutable std::once_flag num_captures_once_;
|
||||
mutable std::once_flag named_groups_once_;
|
||||
mutable std::once_flag group_names_once_;
|
||||
|
||||
RE2(const RE2&) = delete;
|
||||
RE2& operator=(const RE2&) = delete;
|
||||
};
|
||||
|
||||
/***** Implementation details *****/
|
||||
@ -752,7 +760,7 @@ class RE2 {
|
||||
template <class T>
|
||||
class _RE2_MatchObject {
|
||||
public:
|
||||
static inline bool Parse(const char* str, int n, void* dest) {
|
||||
static inline bool Parse(const char* str, size_t n, void* dest) {
|
||||
if (dest == NULL) return true;
|
||||
T* object = reinterpret_cast<T*>(dest);
|
||||
return object->ParseFrom(str, n);
|
||||
@ -767,65 +775,64 @@ class RE2::Arg {
|
||||
// Constructor specially designed for NULL arguments
|
||||
Arg(void*);
|
||||
|
||||
typedef bool (*Parser)(const char* str, int n, void* dest);
|
||||
typedef bool (*Parser)(const char* str, size_t n, void* dest);
|
||||
|
||||
// Type-specific parsers
|
||||
#define MAKE_PARSER(type,name) \
|
||||
Arg(type* p) : arg_(p), parser_(name) { } \
|
||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
|
||||
|
||||
#define MAKE_PARSER(type, name) \
|
||||
Arg(type* p) : arg_(p), parser_(name) {} \
|
||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
|
||||
|
||||
MAKE_PARSER(char, parse_char);
|
||||
MAKE_PARSER(signed char, parse_char);
|
||||
MAKE_PARSER(signed char, parse_schar);
|
||||
MAKE_PARSER(unsigned char, parse_uchar);
|
||||
MAKE_PARSER(float, parse_float);
|
||||
MAKE_PARSER(double, parse_double);
|
||||
MAKE_PARSER(string, parse_string);
|
||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||
|
||||
MAKE_PARSER(short, parse_short);
|
||||
MAKE_PARSER(unsigned short, parse_ushort);
|
||||
MAKE_PARSER(int, parse_int);
|
||||
MAKE_PARSER(unsigned int, parse_uint);
|
||||
MAKE_PARSER(long, parse_long);
|
||||
MAKE_PARSER(unsigned long, parse_ulong);
|
||||
#ifdef RE2_HAVE_LONGLONG
|
||||
MAKE_PARSER(long long, parse_longlong);
|
||||
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
||||
#endif
|
||||
MAKE_PARSER(float, parse_float);
|
||||
MAKE_PARSER(double, parse_double);
|
||||
MAKE_PARSER(string, parse_string);
|
||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||
|
||||
#undef MAKE_PARSER
|
||||
|
||||
// Generic constructor
|
||||
template <class T> Arg(T*, Parser parser);
|
||||
// Generic constructor template
|
||||
// Generic constructor templates
|
||||
template <class T> Arg(T* p)
|
||||
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
|
||||
}
|
||||
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
|
||||
template <class T> Arg(T* p, Parser parser)
|
||||
: arg_(p), parser_(parser) { }
|
||||
|
||||
// Parse the data
|
||||
bool Parse(const char* str, int n) const;
|
||||
bool Parse(const char* str, size_t n) const;
|
||||
|
||||
private:
|
||||
void* arg_;
|
||||
Parser parser_;
|
||||
|
||||
static bool parse_null (const char* str, int n, void* dest);
|
||||
static bool parse_char (const char* str, int n, void* dest);
|
||||
static bool parse_uchar (const char* str, int n, void* dest);
|
||||
static bool parse_float (const char* str, int n, void* dest);
|
||||
static bool parse_double (const char* str, int n, void* dest);
|
||||
static bool parse_string (const char* str, int n, void* dest);
|
||||
static bool parse_stringpiece (const char* str, int n, void* dest);
|
||||
static bool parse_null (const char* str, size_t n, void* dest);
|
||||
static bool parse_char (const char* str, size_t n, void* dest);
|
||||
static bool parse_schar (const char* str, size_t n, void* dest);
|
||||
static bool parse_uchar (const char* str, size_t n, void* dest);
|
||||
static bool parse_float (const char* str, size_t n, void* dest);
|
||||
static bool parse_double (const char* str, size_t n, void* dest);
|
||||
static bool parse_string (const char* str, size_t n, void* dest);
|
||||
static bool parse_stringpiece (const char* str, size_t n, void* dest);
|
||||
|
||||
#define DECLARE_INTEGER_PARSER(name) \
|
||||
private: \
|
||||
static bool parse_ ## name(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _radix( \
|
||||
const char* str, int n, void* dest, int radix); \
|
||||
public: \
|
||||
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
|
||||
#define DECLARE_INTEGER_PARSER(name) \
|
||||
private: \
|
||||
static bool parse_##name(const char* str, size_t n, void* dest); \
|
||||
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
|
||||
int radix); \
|
||||
\
|
||||
public: \
|
||||
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
|
||||
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
|
||||
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
|
||||
|
||||
DECLARE_INTEGER_PARSER(short);
|
||||
DECLARE_INTEGER_PARSER(ushort);
|
||||
@ -833,29 +840,31 @@ class RE2::Arg {
|
||||
DECLARE_INTEGER_PARSER(uint);
|
||||
DECLARE_INTEGER_PARSER(long);
|
||||
DECLARE_INTEGER_PARSER(ulong);
|
||||
#ifdef RE2_HAVE_LONGLONG
|
||||
DECLARE_INTEGER_PARSER(longlong);
|
||||
DECLARE_INTEGER_PARSER(ulonglong);
|
||||
#endif
|
||||
|
||||
#undef DECLARE_INTEGER_PARSER
|
||||
|
||||
};
|
||||
|
||||
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
||||
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
||||
|
||||
inline bool RE2::Arg::Parse(const char* str, int n) const {
|
||||
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
|
||||
return (*parser_)(str, n, arg_);
|
||||
}
|
||||
|
||||
// This part of the parser, appropriate only for ints, deals with bases
|
||||
#define MAKE_INTEGER_PARSER(type, name) \
|
||||
inline RE2::Arg RE2::Hex(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
|
||||
inline RE2::Arg RE2::Octal(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
|
||||
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
|
||||
#define MAKE_INTEGER_PARSER(type, name) \
|
||||
inline RE2::Arg RE2::Hex(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
|
||||
} \
|
||||
inline RE2::Arg RE2::Octal(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
|
||||
} \
|
||||
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
|
||||
}
|
||||
|
||||
MAKE_INTEGER_PARSER(short, short)
|
||||
MAKE_INTEGER_PARSER(unsigned short, ushort)
|
||||
@ -863,15 +872,70 @@ MAKE_INTEGER_PARSER(int, int)
|
||||
MAKE_INTEGER_PARSER(unsigned int, uint)
|
||||
MAKE_INTEGER_PARSER(long, long)
|
||||
MAKE_INTEGER_PARSER(unsigned long, ulong)
|
||||
#ifdef RE2_HAVE_LONGLONG
|
||||
MAKE_INTEGER_PARSER(long long, longlong)
|
||||
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
|
||||
#endif
|
||||
|
||||
#undef MAKE_INTEGER_PARSER
|
||||
|
||||
#ifndef SWIG
|
||||
|
||||
// Silence warnings about missing initializers for members of LazyRE2.
|
||||
// Note that we test for Clang first because it defines __GNUC__ as well.
|
||||
#if defined(__clang__)
|
||||
#elif defined(__GNUC__) && __GNUC__ >= 6
|
||||
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
|
||||
#endif
|
||||
|
||||
// Helper for writing global or static RE2s safely.
|
||||
// Write
|
||||
// static LazyRE2 re = {".*"};
|
||||
// and then use *re instead of writing
|
||||
// static RE2 re(".*");
|
||||
// The former is more careful about multithreaded
|
||||
// situations than the latter.
|
||||
//
|
||||
// N.B. This class never deletes the RE2 object that
|
||||
// it constructs: that's a feature, so that it can be used
|
||||
// for global and function static variables.
|
||||
class LazyRE2 {
|
||||
private:
|
||||
struct NoArg {};
|
||||
|
||||
public:
|
||||
typedef RE2 element_type; // support std::pointer_traits
|
||||
|
||||
// Constructor omitted to preserve braced initialization in C++98.
|
||||
|
||||
// Pretend to be a pointer to Type (never NULL due to on-demand creation):
|
||||
RE2& operator*() const { return *get(); }
|
||||
RE2* operator->() const { return get(); }
|
||||
|
||||
// Named accessor/initializer:
|
||||
RE2* get() const {
|
||||
std::call_once(once_, &LazyRE2::Init, this);
|
||||
return ptr_;
|
||||
}
|
||||
|
||||
// All data fields must be public to support {"foo"} initialization.
|
||||
const char* pattern_;
|
||||
RE2::CannedOptions options_;
|
||||
NoArg barrier_against_excess_initializers_;
|
||||
|
||||
mutable RE2* ptr_;
|
||||
mutable std::once_flag once_;
|
||||
|
||||
private:
|
||||
static void Init(const LazyRE2* lazy_re2) {
|
||||
lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
|
||||
}
|
||||
|
||||
void operator=(const LazyRE2&); // disallowed
|
||||
};
|
||||
#endif // SWIG
|
||||
|
||||
} // namespace re2
|
||||
|
||||
using re2::RE2;
|
||||
using re2::LazyRE2;
|
||||
|
||||
#endif /* RE2_RE2_H */
|
||||
#endif // RE2_RE2_H_
|
||||
|
@ -5,8 +5,21 @@
|
||||
// Regular expression representation.
|
||||
// Tested by parse_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutex.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
@ -14,9 +27,9 @@ namespace re2 {
|
||||
|
||||
// Constructor. Allocates vectors as appropriate for operator.
|
||||
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
|
||||
: op_(op),
|
||||
: op_(static_cast<uint8_t>(op)),
|
||||
simple_(false),
|
||||
parse_flags_(static_cast<uint16>(parse_flags)),
|
||||
parse_flags_(static_cast<uint16_t>(parse_flags)),
|
||||
ref_(1),
|
||||
nsub_(0),
|
||||
down_(NULL) {
|
||||
@ -43,7 +56,8 @@ Regexp::~Regexp() {
|
||||
delete[] runes_;
|
||||
break;
|
||||
case kRegexpCharClass:
|
||||
cc_->Delete();
|
||||
if (cc_)
|
||||
cc_->Delete();
|
||||
delete ccb_;
|
||||
break;
|
||||
}
|
||||
@ -59,30 +73,29 @@ bool Regexp::QuickDestroy() {
|
||||
return false;
|
||||
}
|
||||
|
||||
static map<Regexp*, int> *ref_map;
|
||||
GLOBAL_MUTEX(ref_mutex);
|
||||
// Lazily allocated.
|
||||
static Mutex* ref_mutex;
|
||||
static std::map<Regexp*, int>* ref_map;
|
||||
|
||||
int Regexp::Ref() {
|
||||
if (ref_ < kMaxRef)
|
||||
return ref_;
|
||||
|
||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
||||
int r = 0;
|
||||
if (ref_map != NULL) {
|
||||
r = (*ref_map)[this];
|
||||
}
|
||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
||||
return r;
|
||||
MutexLock l(ref_mutex);
|
||||
return (*ref_map)[this];
|
||||
}
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
Regexp* Regexp::Incref() {
|
||||
if (ref_ >= kMaxRef-1) {
|
||||
static std::once_flag ref_once;
|
||||
std::call_once(ref_once, []() {
|
||||
ref_mutex = new Mutex;
|
||||
ref_map = new std::map<Regexp*, int>;
|
||||
});
|
||||
|
||||
// Store ref count in overflow map.
|
||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
||||
if (ref_map == NULL) {
|
||||
ref_map = new map<Regexp*, int>;
|
||||
}
|
||||
MutexLock l(ref_mutex);
|
||||
if (ref_ == kMaxRef) {
|
||||
// already overflowed
|
||||
(*ref_map)[this]++;
|
||||
@ -91,7 +104,6 @@ Regexp* Regexp::Incref() {
|
||||
(*ref_map)[this] = kMaxRef;
|
||||
ref_ = kMaxRef;
|
||||
}
|
||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -103,15 +115,14 @@ Regexp* Regexp::Incref() {
|
||||
void Regexp::Decref() {
|
||||
if (ref_ == kMaxRef) {
|
||||
// Ref count is stored in overflow map.
|
||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
||||
MutexLock l(ref_mutex);
|
||||
int r = (*ref_map)[this] - 1;
|
||||
if (r < kMaxRef) {
|
||||
ref_ = r;
|
||||
ref_ = static_cast<uint16_t>(r);
|
||||
ref_map->erase(this);
|
||||
} else {
|
||||
(*ref_map)[this] = r;
|
||||
}
|
||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
||||
return;
|
||||
}
|
||||
ref_--;
|
||||
@ -179,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
|
||||
return re;
|
||||
}
|
||||
|
||||
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
|
||||
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
|
||||
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
|
||||
// Squash **, ++ and ??.
|
||||
if (op == sub->op() && flags == sub->parse_flags())
|
||||
return sub;
|
||||
Regexp* re = new Regexp(kRegexpPlus, flags);
|
||||
|
||||
// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
|
||||
// op is Star/Plus/Quest, we just have to check that sub->op() is too.
|
||||
if ((sub->op() == kRegexpStar ||
|
||||
sub->op() == kRegexpPlus ||
|
||||
sub->op() == kRegexpQuest) &&
|
||||
flags == sub->parse_flags()) {
|
||||
// If sub is Star, no need to rewrite it.
|
||||
if (sub->op() == kRegexpStar)
|
||||
return sub;
|
||||
|
||||
// Rewrite sub to Star.
|
||||
Regexp* re = new Regexp(kRegexpStar, flags);
|
||||
re->AllocSub(1);
|
||||
re->sub()[0] = sub->sub()[0]->Incref();
|
||||
sub->Decref(); // We didn't consume the reference after all.
|
||||
return re;
|
||||
}
|
||||
|
||||
Regexp* re = new Regexp(op, flags);
|
||||
re->AllocSub(1);
|
||||
re->sub()[0] = sub;
|
||||
return re;
|
||||
}
|
||||
|
||||
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
|
||||
return StarPlusOrQuest(kRegexpPlus, sub, flags);
|
||||
}
|
||||
|
||||
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
|
||||
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
|
||||
return sub;
|
||||
Regexp* re = new Regexp(kRegexpStar, flags);
|
||||
re->AllocSub(1);
|
||||
re->sub()[0] = sub;
|
||||
return re;
|
||||
return StarPlusOrQuest(kRegexpStar, sub, flags);
|
||||
}
|
||||
|
||||
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
|
||||
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
|
||||
return sub;
|
||||
Regexp* re = new Regexp(kRegexpQuest, flags);
|
||||
re->AllocSub(1);
|
||||
re->sub()[0] = sub;
|
||||
return re;
|
||||
return StarPlusOrQuest(kRegexpQuest, sub, flags);
|
||||
}
|
||||
|
||||
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
|
||||
@ -211,6 +236,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
|
||||
if (nsub == 1)
|
||||
return sub[0];
|
||||
|
||||
if (nsub == 0) {
|
||||
if (op == kRegexpAlternate)
|
||||
return new Regexp(kRegexpNoMatch, flags);
|
||||
else
|
||||
return new Regexp(kRegexpEmptyMatch, flags);
|
||||
}
|
||||
|
||||
Regexp** subcopy = NULL;
|
||||
if (op == kRegexpAlternate && can_factor) {
|
||||
// Going to edit sub; make a copy so we don't step on caller.
|
||||
@ -405,7 +437,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
|
||||
// The stack (vector) has pairs of regexps waiting to
|
||||
// be compared. The regexps are only equal if
|
||||
// all the pairs end up being equal.
|
||||
vector<Regexp*> stk;
|
||||
std::vector<Regexp*> stk;
|
||||
|
||||
for (;;) {
|
||||
// Invariant: TopEqual(a, b) == true.
|
||||
@ -445,10 +477,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int n = stk.size();
|
||||
size_t n = stk.size();
|
||||
if (n == 0)
|
||||
break;
|
||||
|
||||
DCHECK_GE(n, 2);
|
||||
a = stk[n-2];
|
||||
b = stk[n-1];
|
||||
stk.resize(n-2);
|
||||
@ -517,7 +550,9 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> {
|
||||
|
||||
private:
|
||||
int ncapture_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
|
||||
|
||||
NumCapturesWalker(const NumCapturesWalker&) = delete;
|
||||
NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
|
||||
};
|
||||
|
||||
int Regexp::NumCaptures() {
|
||||
@ -532,8 +567,8 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
||||
NamedCapturesWalker() : map_(NULL) {}
|
||||
~NamedCapturesWalker() { delete map_; }
|
||||
|
||||
map<string, int>* TakeMap() {
|
||||
map<string, int>* m = map_;
|
||||
std::map<string, int>* TakeMap() {
|
||||
std::map<string, int>* m = map_;
|
||||
map_ = NULL;
|
||||
return m;
|
||||
}
|
||||
@ -542,7 +577,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
||||
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
||||
// Allocate map once we find a name.
|
||||
if (map_ == NULL)
|
||||
map_ = new map<string, int>;
|
||||
map_ = new std::map<string, int>;
|
||||
|
||||
// Record first occurrence of each name.
|
||||
// (The rule is that if you have the same name
|
||||
@ -560,11 +595,13 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
||||
}
|
||||
|
||||
private:
|
||||
map<string, int>* map_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
|
||||
std::map<string, int>* map_;
|
||||
|
||||
NamedCapturesWalker(const NamedCapturesWalker&) = delete;
|
||||
NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
|
||||
};
|
||||
|
||||
map<string, int>* Regexp::NamedCaptures() {
|
||||
std::map<string, int>* Regexp::NamedCaptures() {
|
||||
NamedCapturesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.TakeMap();
|
||||
@ -576,8 +613,8 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
||||
CaptureNamesWalker() : map_(NULL) {}
|
||||
~CaptureNamesWalker() { delete map_; }
|
||||
|
||||
map<int, string>* TakeMap() {
|
||||
map<int, string>* m = map_;
|
||||
std::map<int, string>* TakeMap() {
|
||||
std::map<int, string>* m = map_;
|
||||
map_ = NULL;
|
||||
return m;
|
||||
}
|
||||
@ -586,7 +623,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
||||
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
||||
// Allocate map once we find a name.
|
||||
if (map_ == NULL)
|
||||
map_ = new map<int, string>;
|
||||
map_ = new std::map<int, string>;
|
||||
|
||||
(*map_)[re->cap()] = *re->name();
|
||||
}
|
||||
@ -600,11 +637,13 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
||||
}
|
||||
|
||||
private:
|
||||
map<int, string>* map_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
|
||||
std::map<int, string>* map_;
|
||||
|
||||
CaptureNamesWalker(const CaptureNamesWalker&) = delete;
|
||||
CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
|
||||
};
|
||||
|
||||
map<int, string>* Regexp::CaptureNames() {
|
||||
std::map<int, string>* Regexp::CaptureNames() {
|
||||
CaptureNamesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.TakeMap();
|
||||
@ -614,7 +653,7 @@ map<int, string>* Regexp::CaptureNames() {
|
||||
// with a fixed string prefix. If so, returns the prefix and
|
||||
// the regexp that remains after the prefix. The prefix might
|
||||
// be ASCII case-insensitive.
|
||||
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
||||
bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) {
|
||||
// No need for a walker: the regexp must be of the form
|
||||
// 1. some number of ^ anchors
|
||||
// 2. a literal char or string
|
||||
@ -643,7 +682,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
||||
if (re->parse_flags() & Latin1) {
|
||||
prefix->resize(re->nrunes_);
|
||||
for (int j = 0; j < re->nrunes_; j++)
|
||||
(*prefix)[j] = re->runes_[j];
|
||||
(*prefix)[j] = static_cast<char>(re->runes_[j]);
|
||||
} else {
|
||||
// Convert to UTF-8 in place.
|
||||
// Assume worst-case space and then trim.
|
||||
@ -652,7 +691,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
||||
for (int j = 0; j < re->nrunes_; j++) {
|
||||
Rune r = re->runes_[j];
|
||||
if (r < Runeself)
|
||||
*p++ = r;
|
||||
*p++ = static_cast<char>(r);
|
||||
else
|
||||
p += runetochar(p, &r);
|
||||
}
|
||||
@ -662,14 +701,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
||||
|
||||
case kRegexpLiteral:
|
||||
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
|
||||
prefix->append(1, re->rune_);
|
||||
prefix->append(1, static_cast<char>(re->rune_));
|
||||
} else {
|
||||
char buf[UTFmax];
|
||||
prefix->append(buf, runetochar(buf, &re->rune_));
|
||||
}
|
||||
break;
|
||||
}
|
||||
*foldcase = (sub[i]->parse_flags() & FoldCase);
|
||||
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
|
||||
i++;
|
||||
|
||||
// The rest.
|
||||
@ -704,13 +743,13 @@ bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
|
||||
if (lo <= 'z' && hi >= 'A') {
|
||||
// Overlaps some alpha, maybe not all.
|
||||
// Update bitmaps telling which ASCII letters are in the set.
|
||||
Rune lo1 = max<Rune>(lo, 'A');
|
||||
Rune hi1 = min<Rune>(hi, 'Z');
|
||||
Rune lo1 = std::max<Rune>(lo, 'A');
|
||||
Rune hi1 = std::min<Rune>(hi, 'Z');
|
||||
if (lo1 <= hi1)
|
||||
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
|
||||
|
||||
lo1 = max<Rune>(lo, 'a');
|
||||
hi1 = min<Rune>(hi, 'z');
|
||||
lo1 = std::max<Rune>(lo, 'a');
|
||||
hi1 = std::min<Rune>(hi, 'z');
|
||||
if (lo1 <= hi1)
|
||||
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
|
||||
}
|
||||
@ -826,7 +865,7 @@ void CharClassBuilder::RemoveAbove(Rune r) {
|
||||
void CharClassBuilder::Negate() {
|
||||
// Build up negation and then copy in.
|
||||
// Could edit ranges in place, but C++ won't let me.
|
||||
vector<RuneRange> v;
|
||||
std::vector<RuneRange> v;
|
||||
v.reserve(ranges_.size() + 1);
|
||||
|
||||
// In negation, first range begins at 0, unless
|
||||
@ -863,7 +902,7 @@ void CharClassBuilder::Negate() {
|
||||
|
||||
CharClass* CharClass::New(int maxranges) {
|
||||
CharClass* cc;
|
||||
uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
|
||||
uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
|
||||
cc = reinterpret_cast<CharClass*>(data);
|
||||
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
|
||||
cc->nranges_ = 0;
|
||||
@ -873,7 +912,7 @@ CharClass* CharClass::New(int maxranges) {
|
||||
}
|
||||
|
||||
void CharClass::Delete() {
|
||||
uint8 *data = reinterpret_cast<uint8*>(this);
|
||||
uint8_t* data = reinterpret_cast<uint8_t*>(this);
|
||||
delete[] data;
|
||||
}
|
||||
|
||||
@ -915,7 +954,7 @@ bool CharClass::Contains(Rune r) {
|
||||
}
|
||||
|
||||
CharClass* CharClassBuilder::GetCharClass() {
|
||||
CharClass* cc = CharClass::New(ranges_.size());
|
||||
CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
|
||||
int n = 0;
|
||||
for (iterator it = begin(); it != end(); ++it)
|
||||
cc->ranges_[n++] = *it;
|
||||
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_REGEXP_H_
|
||||
#define RE2_REGEXP_H_
|
||||
|
||||
// --- SPONSORED LINK --------------------------------------------------
|
||||
// If you want to use this library for regular expression matching,
|
||||
// you should use re2/re2.h, which provides a class RE2 that
|
||||
@ -83,10 +86,14 @@
|
||||
// form accessible to clients, so that client code can analyze the
|
||||
// parsed regular expressions.
|
||||
|
||||
#ifndef RE2_REGEXP_H__
|
||||
#define RE2_REGEXP_H__
|
||||
#include <stdint.h>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
@ -185,10 +192,10 @@ class RegexpStatus {
|
||||
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
||||
~RegexpStatus() { delete tmp_; }
|
||||
|
||||
void set_code(enum RegexpStatusCode code) { code_ = code; }
|
||||
void set_code(RegexpStatusCode code) { code_ = code; }
|
||||
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
||||
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
||||
enum RegexpStatusCode code() const { return code_; }
|
||||
RegexpStatusCode code() const { return code_; }
|
||||
const StringPiece& error_arg() const { return error_arg_; }
|
||||
bool ok() const { return code() == kRegexpSuccess; }
|
||||
|
||||
@ -197,23 +204,21 @@ class RegexpStatus {
|
||||
|
||||
// Returns text equivalent of code, e.g.:
|
||||
// "Bad character class"
|
||||
static string CodeText(enum RegexpStatusCode code);
|
||||
static string CodeText(RegexpStatusCode code);
|
||||
|
||||
// Returns text describing error, e.g.:
|
||||
// "Bad character class: [z-a]"
|
||||
string Text() const;
|
||||
|
||||
private:
|
||||
enum RegexpStatusCode code_; // Kind of error
|
||||
RegexpStatusCode code_; // Kind of error
|
||||
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
||||
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
|
||||
RegexpStatus(const RegexpStatus&) = delete;
|
||||
RegexpStatus& operator=(const RegexpStatus&) = delete;
|
||||
};
|
||||
|
||||
// Walker to implement Simplify.
|
||||
class SimplifyWalker;
|
||||
|
||||
// Compiled form; see prog.h
|
||||
class Prog;
|
||||
|
||||
@ -261,7 +266,9 @@ class CharClass {
|
||||
int nrunes_;
|
||||
RuneRange *ranges_;
|
||||
int nranges_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
|
||||
|
||||
CharClass(const CharClass&) = delete;
|
||||
CharClass& operator=(const CharClass&) = delete;
|
||||
};
|
||||
|
||||
class Regexp {
|
||||
@ -269,51 +276,52 @@ class Regexp {
|
||||
|
||||
// Flags for parsing. Can be ORed together.
|
||||
enum ParseFlags {
|
||||
NoParseFlags = 0,
|
||||
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
|
||||
Literal = 1<<1, // Treat s as literal string instead of a regexp.
|
||||
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
|
||||
// and [[:space:]] to match newline.
|
||||
DotNL = 1<<3, // Allow . to match newline.
|
||||
MatchNL = ClassNL | DotNL,
|
||||
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
|
||||
// end of text, not around embedded newlines.
|
||||
// (Perl's default)
|
||||
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
|
||||
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
|
||||
PerlClasses = 1<<7, // Allow Perl character classes like \d.
|
||||
PerlB = 1<<8, // Allow Perl's \b and \B.
|
||||
PerlX = 1<<9, // Perl extensions:
|
||||
// non-capturing parens - (?: )
|
||||
// non-greedy operators - *? +? ?? {}?
|
||||
// flag edits - (?i) (?-i) (?i: )
|
||||
// i - FoldCase
|
||||
// m - !OneLine
|
||||
// s - DotNL
|
||||
// U - NonGreedy
|
||||
// line ends: \A \z
|
||||
// \Q and \E to disable/enable metacharacters
|
||||
// (?P<name>expr) for named captures
|
||||
// \C to match any single byte
|
||||
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
|
||||
// and \P{Han} for its negation.
|
||||
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
|
||||
// it explicitly.
|
||||
NeverCapture = 1<<12, // Parse all parens as non-capturing.
|
||||
NoParseFlags = 0,
|
||||
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
|
||||
Literal = 1<<1, // Treat s as literal string instead of a regexp.
|
||||
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
|
||||
// and [[:space:]] to match newline.
|
||||
DotNL = 1<<3, // Allow . to match newline.
|
||||
MatchNL = ClassNL | DotNL,
|
||||
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
|
||||
// end of text, not around embedded newlines.
|
||||
// (Perl's default)
|
||||
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
|
||||
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
|
||||
PerlClasses = 1<<7, // Allow Perl character classes like \d.
|
||||
PerlB = 1<<8, // Allow Perl's \b and \B.
|
||||
PerlX = 1<<9, // Perl extensions:
|
||||
// non-capturing parens - (?: )
|
||||
// non-greedy operators - *? +? ?? {}?
|
||||
// flag edits - (?i) (?-i) (?i: )
|
||||
// i - FoldCase
|
||||
// m - !OneLine
|
||||
// s - DotNL
|
||||
// U - NonGreedy
|
||||
// line ends: \A \z
|
||||
// \Q and \E to disable/enable metacharacters
|
||||
// (?P<name>expr) for named captures
|
||||
// \C to match any single byte
|
||||
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
|
||||
// and \P{Han} for its negation.
|
||||
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
|
||||
// it explicitly.
|
||||
NeverCapture = 1<<12, // Parse all parens as non-capturing.
|
||||
|
||||
// As close to Perl as we can get.
|
||||
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
|
||||
UnicodeGroups,
|
||||
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
|
||||
UnicodeGroups,
|
||||
|
||||
// Internal use only.
|
||||
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
|
||||
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
|
||||
AllParseFlags = (1<<14)-1,
|
||||
};
|
||||
|
||||
// Get. No set, Regexps are logically immutable once created.
|
||||
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
||||
int nsub() { return nsub_; }
|
||||
bool simple() { return simple_; }
|
||||
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
||||
bool simple() { return simple_ != 0; }
|
||||
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
||||
int Ref(); // For testing.
|
||||
|
||||
Regexp** sub() {
|
||||
@ -353,6 +361,7 @@ class Regexp {
|
||||
// removed. The result will capture exactly the same
|
||||
// subexpressions the original did, unless formatted with ToString.
|
||||
Regexp* Simplify();
|
||||
friend class CoalesceWalker;
|
||||
friend class SimplifyWalker;
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
@ -369,12 +378,12 @@ class Regexp {
|
||||
// Returns a map from names to capturing group indices,
|
||||
// or NULL if the regexp contains no named capture groups.
|
||||
// The caller is responsible for deleting the map.
|
||||
map<string, int>* NamedCaptures();
|
||||
std::map<string, int>* NamedCaptures();
|
||||
|
||||
// Returns a map from capturing group indices to capturing group
|
||||
// names or NULL if the regexp contains no named capture groups. The
|
||||
// caller is responsible for deleting the map.
|
||||
map<int, string>* CaptureNames();
|
||||
std::map<int, string>* CaptureNames();
|
||||
|
||||
// Returns a string representation of the current regexp,
|
||||
// using as few parentheses as possible.
|
||||
@ -410,8 +419,8 @@ class Regexp {
|
||||
// Construction and execution of prog will
|
||||
// stay within approximately max_mem bytes of memory.
|
||||
// If max_mem <= 0, a reasonable default is used.
|
||||
Prog* CompileToProg(int64 max_mem);
|
||||
Prog* CompileToReverseProg(int64 max_mem);
|
||||
Prog* CompileToProg(int64_t max_mem);
|
||||
Prog* CompileToReverseProg(int64_t max_mem);
|
||||
|
||||
// Whether to expect this library to find exactly the same answer as PCRE
|
||||
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
||||
@ -427,7 +436,9 @@ class Regexp {
|
||||
// begin with a non-empty fixed string (perhaps after ASCII
|
||||
// case-folding). If so, returns the prefix and the sub-regexp that
|
||||
// follows it.
|
||||
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
|
||||
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
|
||||
// regardless of the return value.
|
||||
bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
|
||||
|
||||
private:
|
||||
// Constructor allocates vectors as appropriate for operator.
|
||||
@ -441,6 +452,7 @@ class Regexp {
|
||||
|
||||
// Helpers for Parse. Listed here so they can edit Regexps.
|
||||
class ParseState;
|
||||
|
||||
friend class ParseState;
|
||||
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
||||
RegexpStatus* status);
|
||||
@ -451,6 +463,10 @@ class Regexp {
|
||||
// Computes whether Regexp is already simple.
|
||||
bool ComputeSimple();
|
||||
|
||||
// Constructor that generates a Star, Plus or Quest,
|
||||
// squashing the pair if sub is also a Star, Plus or Quest.
|
||||
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
|
||||
|
||||
// Constructor that generates a concatenation or alternation,
|
||||
// enforcing the limit on the number of subexpressions for
|
||||
// a particular Regexp.
|
||||
@ -478,8 +494,7 @@ class Regexp {
|
||||
// Simplifies an alternation of literal strings by factoring out
|
||||
// common prefixes.
|
||||
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
||||
static int FactorAlternationRecursive(Regexp** sub, int nsub,
|
||||
ParseFlags flags, int maxdepth);
|
||||
friend class FactorAlternationImpl;
|
||||
|
||||
// Is a == b? Only efficient on regexps that have not been through
|
||||
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
||||
@ -488,11 +503,10 @@ class Regexp {
|
||||
|
||||
// Allocate space for n sub-regexps.
|
||||
void AllocSub(int n) {
|
||||
if (n < 0 || static_cast<uint16>(n) != n)
|
||||
LOG(FATAL) << "Cannot AllocSub " << n;
|
||||
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
|
||||
if (n > 1)
|
||||
submany_ = new Regexp*[n];
|
||||
nsub_ = n;
|
||||
nsub_ = static_cast<uint16_t>(n);
|
||||
}
|
||||
|
||||
// Add Rune to LiteralString
|
||||
@ -502,38 +516,38 @@ class Regexp {
|
||||
void Swap(Regexp *that);
|
||||
|
||||
// Operator. See description of operators above.
|
||||
// uint8 instead of RegexpOp to control space usage.
|
||||
uint8 op_;
|
||||
// uint8_t instead of RegexpOp to control space usage.
|
||||
uint8_t op_;
|
||||
|
||||
// Is this regexp structure already simple
|
||||
// (has it been returned by Simplify)?
|
||||
// uint8 instead of bool to control space usage.
|
||||
uint8 simple_;
|
||||
// uint8_t instead of bool to control space usage.
|
||||
uint8_t simple_;
|
||||
|
||||
// Flags saved from parsing and used during execution.
|
||||
// (Only FoldCase is used.)
|
||||
// uint16 instead of ParseFlags to control space usage.
|
||||
uint16 parse_flags_;
|
||||
// uint16_t instead of ParseFlags to control space usage.
|
||||
uint16_t parse_flags_;
|
||||
|
||||
// Reference count. Exists so that SimplifyRegexp can build
|
||||
// regexp structures that are dags rather than trees to avoid
|
||||
// exponential blowup in space requirements.
|
||||
// uint16 to control space usage.
|
||||
// uint16_t to control space usage.
|
||||
// The standard regexp routines will never generate a
|
||||
// ref greater than the maximum repeat count (100),
|
||||
// ref greater than the maximum repeat count (kMaxRepeat),
|
||||
// but even so, Incref and Decref consult an overflow map
|
||||
// when ref_ reaches kMaxRef.
|
||||
uint16 ref_;
|
||||
static const uint16 kMaxRef = 0xffff;
|
||||
uint16_t ref_;
|
||||
static const uint16_t kMaxRef = 0xffff;
|
||||
|
||||
// Subexpressions.
|
||||
// uint16 to control space usage.
|
||||
// uint16_t to control space usage.
|
||||
// Concat and Alternate handle larger numbers of subexpressions
|
||||
// by building concatenation or alternation trees.
|
||||
// Other routines should call Concat or Alternate instead of
|
||||
// filling in sub() by hand.
|
||||
uint16 nsub_;
|
||||
static const uint16 kMaxNsub = 0xffff;
|
||||
uint16_t nsub_;
|
||||
static const uint16_t kMaxNsub = 0xffff;
|
||||
union {
|
||||
Regexp** submany_; // if nsub_ > 1
|
||||
Regexp* subone_; // if nsub_ == 1
|
||||
@ -568,11 +582,12 @@ class Regexp {
|
||||
void *the_union_[2]; // as big as any other element, for memset
|
||||
};
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
|
||||
Regexp(const Regexp&) = delete;
|
||||
Regexp& operator=(const Regexp&) = delete;
|
||||
};
|
||||
|
||||
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
||||
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
|
||||
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
|
||||
|
||||
class CharClassBuilder {
|
||||
public:
|
||||
@ -597,37 +612,41 @@ class CharClassBuilder {
|
||||
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
||||
|
||||
private:
|
||||
static const uint32 AlphaMask = (1<<26) - 1;
|
||||
uint32 upper_; // bitmap of A-Z
|
||||
uint32 lower_; // bitmap of a-z
|
||||
static const uint32_t AlphaMask = (1<<26) - 1;
|
||||
uint32_t upper_; // bitmap of A-Z
|
||||
uint32_t lower_; // bitmap of a-z
|
||||
int nrunes_;
|
||||
RuneRangeSet ranges_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
|
||||
|
||||
CharClassBuilder(const CharClassBuilder&) = delete;
|
||||
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
|
||||
};
|
||||
|
||||
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
|
||||
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
||||
{
|
||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
|
||||
// Bitwise ops on ParseFlags produce ParseFlags.
|
||||
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
|
||||
Regexp::ParseFlags b) {
|
||||
return static_cast<Regexp::ParseFlags>(
|
||||
static_cast<int>(a) | static_cast<int>(b));
|
||||
}
|
||||
|
||||
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
||||
{
|
||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
|
||||
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
|
||||
Regexp::ParseFlags b) {
|
||||
return static_cast<Regexp::ParseFlags>(
|
||||
static_cast<int>(a) ^ static_cast<int>(b));
|
||||
}
|
||||
|
||||
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
||||
{
|
||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
|
||||
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
|
||||
Regexp::ParseFlags b) {
|
||||
return static_cast<Regexp::ParseFlags>(
|
||||
static_cast<int>(a) & static_cast<int>(b));
|
||||
}
|
||||
|
||||
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
|
||||
{
|
||||
return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
|
||||
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
|
||||
// Attempting to produce a value out of enum's range has undefined behaviour.
|
||||
return static_cast<Regexp::ParseFlags>(
|
||||
~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_REGEXP_H__
|
||||
#endif // RE2_REGEXP_H_
|
||||
|
@ -4,36 +4,42 @@
|
||||
|
||||
#include "re2/set.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
using namespace re2;
|
||||
namespace re2 {
|
||||
|
||||
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
|
||||
options_.Copy(options);
|
||||
options_.set_never_capture(true); // might unblock some optimisations
|
||||
anchor_ = anchor;
|
||||
prog_ = NULL;
|
||||
compiled_ = false;
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
RE2::Set::~Set() {
|
||||
for (size_t i = 0; i < re_.size(); i++)
|
||||
re_[i]->Decref();
|
||||
for (size_t i = 0; i < elem_.size(); i++)
|
||||
elem_[i].second->Decref();
|
||||
delete prog_;
|
||||
}
|
||||
|
||||
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Add after Compile";
|
||||
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
|
||||
return -1;
|
||||
}
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
|
||||
RegexpStatus status;
|
||||
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
||||
if (re == NULL) {
|
||||
@ -45,7 +51,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
||||
}
|
||||
|
||||
// Concatenate with match index and push on vector.
|
||||
int n = re_.size();
|
||||
int n = static_cast<int>(elem_.size());
|
||||
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
||||
if (re->op() == kRegexpConcat) {
|
||||
int nsub = re->nsub();
|
||||
@ -62,52 +68,87 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
||||
sub[1] = m;
|
||||
re = re2::Regexp::Concat(sub, 2, pf);
|
||||
}
|
||||
re_.push_back(re);
|
||||
elem_.emplace_back(pattern.ToString(), re);
|
||||
return n;
|
||||
}
|
||||
|
||||
bool RE2::Set::Compile() {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Compile multiple times";
|
||||
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
|
||||
return false;
|
||||
}
|
||||
compiled_ = true;
|
||||
size_ = static_cast<int>(elem_.size());
|
||||
|
||||
// Sort the elements by their patterns. This is good enough for now
|
||||
// until we have a Regexp comparison function. (Maybe someday...)
|
||||
std::sort(elem_.begin(), elem_.end(),
|
||||
[](const Elem& a, const Elem& b) -> bool {
|
||||
return a.first < b.first;
|
||||
});
|
||||
|
||||
re2::Regexp** sub = new re2::Regexp*[size_];
|
||||
for (size_t i = 0; i < elem_.size(); i++)
|
||||
sub[i] = elem_[i].second;
|
||||
elem_.clear();
|
||||
elem_.shrink_to_fit();
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
|
||||
re_.size(), pf);
|
||||
re_.clear();
|
||||
re2::Regexp* sre = re->Simplify();
|
||||
re->Decref();
|
||||
re = sre;
|
||||
if (re == NULL) {
|
||||
if (options_.log_errors())
|
||||
LOG(ERROR) << "Error simplifying during Compile.";
|
||||
return false;
|
||||
}
|
||||
re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
|
||||
delete[] sub;
|
||||
|
||||
prog_ = Prog::CompileSet(options_, anchor_, re);
|
||||
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
|
||||
re->Decref();
|
||||
return prog_ != NULL;
|
||||
}
|
||||
|
||||
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Match without Compile";
|
||||
return false;
|
||||
}
|
||||
v->clear();
|
||||
bool failed;
|
||||
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
|
||||
Prog::kManyMatch, NULL, &failed, v);
|
||||
if (failed)
|
||||
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
|
||||
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
|
||||
return Match(text, v, NULL);
|
||||
}
|
||||
|
||||
if (ret == false)
|
||||
return false;
|
||||
if (v->size() == 0) {
|
||||
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
|
||||
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
|
||||
ErrorInfo* error_info) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Match() called before compiling";
|
||||
if (error_info != NULL)
|
||||
error_info->kind = kNotCompiled;
|
||||
return false;
|
||||
}
|
||||
bool dfa_failed = false;
|
||||
std::unique_ptr<SparseSet> matches;
|
||||
if (v != NULL) {
|
||||
matches.reset(new SparseSet(size_));
|
||||
v->clear();
|
||||
}
|
||||
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
|
||||
NULL, &dfa_failed, matches.get());
|
||||
if (dfa_failed) {
|
||||
if (options_.log_errors())
|
||||
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
|
||||
<< "bytemap range " << prog_->bytemap_range() << ", "
|
||||
<< "list count " << prog_->list_count();
|
||||
if (error_info != NULL)
|
||||
error_info->kind = kOutOfMemory;
|
||||
return false;
|
||||
}
|
||||
if (ret == false) {
|
||||
if (error_info != NULL)
|
||||
error_info->kind = kNoError;
|
||||
return false;
|
||||
}
|
||||
if (v != NULL) {
|
||||
if (matches->empty()) {
|
||||
LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
|
||||
if (error_info != NULL)
|
||||
error_info->kind = kInconsistent;
|
||||
return false;
|
||||
}
|
||||
v->assign(matches->begin(), matches->end());
|
||||
}
|
||||
if (error_info != NULL)
|
||||
error_info->kind = kNoError;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
@ -2,54 +2,79 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_SET_H
|
||||
#define RE2_SET_H
|
||||
#ifndef RE2_SET_H_
|
||||
#define RE2_SET_H_
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::vector;
|
||||
class Prog;
|
||||
class Regexp;
|
||||
} // namespace re2
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// An RE2::Set represents a collection of regexps that can
|
||||
// be searched for simultaneously.
|
||||
class RE2::Set {
|
||||
public:
|
||||
enum ErrorKind {
|
||||
kNoError = 0,
|
||||
kNotCompiled, // The set is not compiled.
|
||||
kOutOfMemory, // The DFA ran out of memory.
|
||||
kInconsistent, // The result is inconsistent. This should never happen.
|
||||
};
|
||||
|
||||
struct ErrorInfo {
|
||||
ErrorKind kind;
|
||||
};
|
||||
|
||||
Set(const RE2::Options& options, RE2::Anchor anchor);
|
||||
~Set();
|
||||
|
||||
// Add adds regexp pattern to the set, interpreted using the RE2 options.
|
||||
// (The RE2 constructor's default options parameter is RE2::UTF8.)
|
||||
// Add returns the regexp index that will be used to identify
|
||||
// it in the result of Match, or -1 if the regexp cannot be parsed.
|
||||
// Adds pattern to the set using the options passed to the constructor.
|
||||
// Returns the index that will identify the regexp in the output of Match(),
|
||||
// or -1 if the regexp cannot be parsed.
|
||||
// Indices are assigned in sequential order starting from 0.
|
||||
// Error returns do not increment the index.
|
||||
// If an error occurs and error != NULL, *error will hold an error message.
|
||||
// Errors do not increment the index; if error is not NULL, *error will hold
|
||||
// the error message from the parser.
|
||||
int Add(const StringPiece& pattern, string* error);
|
||||
|
||||
// Compile prepares the Set for matching.
|
||||
// Add must not be called again after Compile.
|
||||
// Compile must be called before FullMatch or PartialMatch.
|
||||
// Compile may return false if it runs out of memory.
|
||||
// Compiles the set in preparation for matching.
|
||||
// Returns false if the compiler runs out of memory.
|
||||
// Add() must not be called again after Compile().
|
||||
// Compile() must be called before Match().
|
||||
bool Compile();
|
||||
|
||||
// Match returns true if text matches any of the regexps in the set.
|
||||
// If so, it fills v with the indices of the matching regexps.
|
||||
bool Match(const StringPiece& text, vector<int>* v) const;
|
||||
// Returns true if text matches at least one of the regexps in the set.
|
||||
// Fills v (if not NULL) with the indices of the matching regexps.
|
||||
// Callers must not expect v to be sorted.
|
||||
bool Match(const StringPiece& text, std::vector<int>* v) const;
|
||||
|
||||
// As above, but populates error_info (if not NULL) when none of the regexps
|
||||
// in the set matched. This can inform callers when DFA execution fails, for
|
||||
// example, because they might wish to handle that case differently.
|
||||
bool Match(const StringPiece& text, std::vector<int>* v,
|
||||
ErrorInfo* error_info) const;
|
||||
|
||||
private:
|
||||
typedef std::pair<string, re2::Regexp*> Elem;
|
||||
|
||||
RE2::Options options_;
|
||||
RE2::Anchor anchor_;
|
||||
vector<re2::Regexp*> re_;
|
||||
std::vector<Elem> elem_;
|
||||
re2::Prog* prog_;
|
||||
bool compiled_;
|
||||
//DISALLOW_EVIL_CONSTRUCTORS(Set);
|
||||
Set(const Set&);
|
||||
void operator=(const Set&);
|
||||
int size_;
|
||||
|
||||
Set(const Set&) = delete;
|
||||
Set& operator=(const Set&) = delete;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_SET_H
|
||||
#endif // RE2_SET_H_
|
||||
|
@ -6,7 +6,11 @@
|
||||
// to use simple extended regular expression features.
|
||||
// Also sort and simplify character classes.
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
@ -61,7 +65,7 @@ bool Regexp::ComputeSimple() {
|
||||
// These are simple as long as the subpieces are simple.
|
||||
subs = sub();
|
||||
for (int i = 0; i < nsub_; i++)
|
||||
if (!subs[i]->simple_)
|
||||
if (!subs[i]->simple())
|
||||
return false;
|
||||
return true;
|
||||
case kRegexpCharClass:
|
||||
@ -71,12 +75,12 @@ bool Regexp::ComputeSimple() {
|
||||
return !cc_->empty() && !cc_->full();
|
||||
case kRegexpCapture:
|
||||
subs = sub();
|
||||
return subs[0]->simple_;
|
||||
return subs[0]->simple();
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
subs = sub();
|
||||
if (!subs[0]->simple_)
|
||||
if (!subs[0]->simple())
|
||||
return false;
|
||||
switch (subs[0]->op_) {
|
||||
case kRegexpStar:
|
||||
@ -96,6 +100,37 @@ bool Regexp::ComputeSimple() {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
||||
// occurrences of that literal into repeats of that literal. It also works for
|
||||
// char classes, any char and any byte.
|
||||
// PostVisit creates the coalesced result, which should then be simplified.
|
||||
class CoalesceWalker : public Regexp::Walker<Regexp*> {
|
||||
public:
|
||||
CoalesceWalker() {}
|
||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||
Regexp** child_args, int nchild_args);
|
||||
virtual Regexp* Copy(Regexp* re);
|
||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||
|
||||
private:
|
||||
// These functions are declared inside CoalesceWalker so that
|
||||
// they can edit the private fields of the Regexps they construct.
|
||||
|
||||
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
||||
// the parse flags are consistent. (They will not be checked again later.)
|
||||
static bool CanCoalesce(Regexp* r1, Regexp* r2);
|
||||
|
||||
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
||||
// will be empty match and the coalesced op. In other cases, where part of a
|
||||
// literal string was removed to be coalesced, the array elements afterwards
|
||||
// will be the coalesced op and the remainder of the literal string.
|
||||
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
|
||||
|
||||
CoalesceWalker(const CoalesceWalker&) = delete;
|
||||
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
|
||||
};
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// The simplify walk is purely post-recursive: given the simplified children,
|
||||
// PostVisit creates the simplified result.
|
||||
@ -104,9 +139,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
||||
public:
|
||||
SimplifyWalker() {}
|
||||
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
||||
virtual Regexp* PostVisit(Regexp* re,
|
||||
Regexp* parent_arg,
|
||||
Regexp* pre_arg,
|
||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||
Regexp** child_args, int nchild_args);
|
||||
virtual Regexp* Copy(Regexp* re);
|
||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||
@ -130,7 +163,8 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* SimplifyCharClass(Regexp* re);
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
|
||||
SimplifyWalker(const SimplifyWalker&) = delete;
|
||||
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
|
||||
};
|
||||
|
||||
// Simplifies a regular expression, returning a new regexp.
|
||||
@ -143,14 +177,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
||||
// Caller must Decref() return value when done with it.
|
||||
|
||||
Regexp* Regexp::Simplify() {
|
||||
if (simple_)
|
||||
return Incref();
|
||||
SimplifyWalker w;
|
||||
return w.Walk(this, NULL);
|
||||
CoalesceWalker cw;
|
||||
Regexp* cre = cw.Walk(this, NULL);
|
||||
if (cre == NULL)
|
||||
return cre;
|
||||
SimplifyWalker sw;
|
||||
Regexp* sre = sw.Walk(cre, NULL);
|
||||
cre->Decref();
|
||||
return sre;
|
||||
}
|
||||
|
||||
#define Simplify DontCallSimplify // Avoid accidental recursion
|
||||
|
||||
// Utility function for PostVisit implementations that compares re->sub() with
|
||||
// child_args to determine whether any child_args changed. In the common case,
|
||||
// where nothing changed, calls Decref() for all child_args and returns false,
|
||||
// so PostVisit must return re->Incref(). Otherwise, returns true.
|
||||
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
Regexp* sub = re->sub()[i];
|
||||
Regexp* newsub = child_args[i];
|
||||
if (newsub != sub)
|
||||
return true;
|
||||
}
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
Regexp* newsub = child_args[i];
|
||||
newsub->Decref();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
Regexp* CoalesceWalker::Copy(Regexp* re) {
|
||||
return re->Incref();
|
||||
}
|
||||
|
||||
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
|
||||
// This should never be called, since we use Walk and not
|
||||
// WalkExponential.
|
||||
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
|
||||
return re->Incref();
|
||||
}
|
||||
|
||||
Regexp* CoalesceWalker::PostVisit(Regexp* re,
|
||||
Regexp* parent_arg,
|
||||
Regexp* pre_arg,
|
||||
Regexp** child_args,
|
||||
int nchild_args) {
|
||||
if (re->nsub() == 0)
|
||||
return re->Incref();
|
||||
|
||||
if (re->op() != kRegexpConcat) {
|
||||
if (!ChildArgsChanged(re, child_args))
|
||||
return re->Incref();
|
||||
|
||||
// Something changed. Build a new op.
|
||||
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||
nre->AllocSub(re->nsub());
|
||||
Regexp** nre_subs = nre->sub();
|
||||
for (int i = 0; i < re->nsub(); i++)
|
||||
nre_subs[i] = child_args[i];
|
||||
// Repeats and Captures have additional data that must be copied.
|
||||
if (re->op() == kRegexpRepeat) {
|
||||
nre->min_ = re->min();
|
||||
nre->max_ = re->max();
|
||||
} else if (re->op() == kRegexpCapture) {
|
||||
nre->cap_ = re->cap();
|
||||
}
|
||||
return nre;
|
||||
}
|
||||
|
||||
bool can_coalesce = false;
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
if (i+1 < re->nsub() &&
|
||||
CanCoalesce(child_args[i], child_args[i+1])) {
|
||||
can_coalesce = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!can_coalesce) {
|
||||
if (!ChildArgsChanged(re, child_args))
|
||||
return re->Incref();
|
||||
|
||||
// Something changed. Build a new op.
|
||||
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||
nre->AllocSub(re->nsub());
|
||||
Regexp** nre_subs = nre->sub();
|
||||
for (int i = 0; i < re->nsub(); i++)
|
||||
nre_subs[i] = child_args[i];
|
||||
return nre;
|
||||
}
|
||||
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
if (i+1 < re->nsub() &&
|
||||
CanCoalesce(child_args[i], child_args[i+1]))
|
||||
DoCoalesce(&child_args[i], &child_args[i+1]);
|
||||
}
|
||||
// Determine how many empty matches were left by DoCoalesce.
|
||||
int n = 0;
|
||||
for (int i = n; i < re->nsub(); i++) {
|
||||
if (child_args[i]->op() == kRegexpEmptyMatch)
|
||||
n++;
|
||||
}
|
||||
// Build a new op.
|
||||
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||
nre->AllocSub(re->nsub() - n);
|
||||
Regexp** nre_subs = nre->sub();
|
||||
for (int i = 0, j = 0; i < re->nsub(); i++) {
|
||||
if (child_args[i]->op() == kRegexpEmptyMatch) {
|
||||
child_args[i]->Decref();
|
||||
continue;
|
||||
}
|
||||
nre_subs[j] = child_args[i];
|
||||
j++;
|
||||
}
|
||||
return nre;
|
||||
}
|
||||
|
||||
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
|
||||
// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
|
||||
// any byte.
|
||||
if ((r1->op() == kRegexpStar ||
|
||||
r1->op() == kRegexpPlus ||
|
||||
r1->op() == kRegexpQuest ||
|
||||
r1->op() == kRegexpRepeat) &&
|
||||
(r1->sub()[0]->op() == kRegexpLiteral ||
|
||||
r1->sub()[0]->op() == kRegexpCharClass ||
|
||||
r1->sub()[0]->op() == kRegexpAnyChar ||
|
||||
r1->sub()[0]->op() == kRegexpAnyByte)) {
|
||||
// r2 must be a star/plus/quest/repeat of the same literal, char class,
|
||||
// any char or any byte.
|
||||
if ((r2->op() == kRegexpStar ||
|
||||
r2->op() == kRegexpPlus ||
|
||||
r2->op() == kRegexpQuest ||
|
||||
r2->op() == kRegexpRepeat) &&
|
||||
Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
|
||||
// The parse flags must be consistent.
|
||||
((r1->parse_flags() & Regexp::NonGreedy) ==
|
||||
(r2->parse_flags() & Regexp::NonGreedy))) {
|
||||
return true;
|
||||
}
|
||||
// ... OR an occurrence of that literal, char class, any char or any byte
|
||||
if (Regexp::Equal(r1->sub()[0], r2)) {
|
||||
return true;
|
||||
}
|
||||
// ... OR a literal string that begins with that literal.
|
||||
if (r1->sub()[0]->op() == kRegexpLiteral &&
|
||||
r2->op() == kRegexpLiteralString &&
|
||||
r2->runes()[0] == r1->sub()[0]->rune() &&
|
||||
// The parse flags must be consistent.
|
||||
((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
|
||||
(r2->parse_flags() & Regexp::FoldCase))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
|
||||
Regexp* r1 = *r1ptr;
|
||||
Regexp* r2 = *r2ptr;
|
||||
|
||||
Regexp* nre = Regexp::Repeat(
|
||||
r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
|
||||
|
||||
switch (r1->op()) {
|
||||
case kRegexpStar:
|
||||
nre->min_ = 0;
|
||||
nre->max_ = -1;
|
||||
break;
|
||||
|
||||
case kRegexpPlus:
|
||||
nre->min_ = 1;
|
||||
nre->max_ = -1;
|
||||
break;
|
||||
|
||||
case kRegexpQuest:
|
||||
nre->min_ = 0;
|
||||
nre->max_ = 1;
|
||||
break;
|
||||
|
||||
case kRegexpRepeat:
|
||||
nre->min_ = r1->min();
|
||||
nre->max_ = r1->max();
|
||||
break;
|
||||
|
||||
default:
|
||||
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
|
||||
nre->Decref();
|
||||
return;
|
||||
}
|
||||
|
||||
switch (r2->op()) {
|
||||
case kRegexpStar:
|
||||
nre->max_ = -1;
|
||||
goto LeaveEmpty;
|
||||
|
||||
case kRegexpPlus:
|
||||
nre->min_++;
|
||||
nre->max_ = -1;
|
||||
goto LeaveEmpty;
|
||||
|
||||
case kRegexpQuest:
|
||||
if (nre->max() != -1)
|
||||
nre->max_++;
|
||||
goto LeaveEmpty;
|
||||
|
||||
case kRegexpRepeat:
|
||||
nre->min_ += r2->min();
|
||||
if (r2->max() == -1)
|
||||
nre->max_ = -1;
|
||||
else if (nre->max() != -1)
|
||||
nre->max_ += r2->max();
|
||||
goto LeaveEmpty;
|
||||
|
||||
case kRegexpLiteral:
|
||||
case kRegexpCharClass:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
nre->min_++;
|
||||
if (nre->max() != -1)
|
||||
nre->max_++;
|
||||
goto LeaveEmpty;
|
||||
|
||||
LeaveEmpty:
|
||||
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
|
||||
*r2ptr = nre;
|
||||
break;
|
||||
|
||||
case kRegexpLiteralString: {
|
||||
Rune r = r1->sub()[0]->rune();
|
||||
// Determine how much of the literal string is removed.
|
||||
// We know that we have at least one rune. :)
|
||||
int n = 1;
|
||||
while (n < r2->nrunes() && r2->runes()[n] == r)
|
||||
n++;
|
||||
nre->min_ += n;
|
||||
if (nre->max() != -1)
|
||||
nre->max_ += n;
|
||||
if (n == r2->nrunes())
|
||||
goto LeaveEmpty;
|
||||
*r1ptr = nre;
|
||||
*r2ptr = Regexp::LiteralString(
|
||||
&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
|
||||
nre->Decref();
|
||||
return;
|
||||
}
|
||||
|
||||
r1->Decref();
|
||||
r2->Decref();
|
||||
}
|
||||
|
||||
Regexp* SimplifyWalker::Copy(Regexp* re) {
|
||||
return re->Incref();
|
||||
}
|
||||
@ -163,7 +444,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
|
||||
}
|
||||
|
||||
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
|
||||
if (re->simple_) {
|
||||
if (re->simple()) {
|
||||
*stop = true;
|
||||
return re->Incref();
|
||||
}
|
||||
@ -196,29 +477,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate: {
|
||||
// These are simple as long as the subpieces are simple.
|
||||
// Two passes to avoid allocation in the common case.
|
||||
bool changed = false;
|
||||
Regexp** subs = re->sub();
|
||||
for (int i = 0; i < re->nsub_; i++) {
|
||||
Regexp* sub = subs[i];
|
||||
Regexp* newsub = child_args[i];
|
||||
if (newsub != sub) {
|
||||
changed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!changed) {
|
||||
for (int i = 0; i < re->nsub_; i++) {
|
||||
Regexp* newsub = child_args[i];
|
||||
newsub->Decref();
|
||||
}
|
||||
if (!ChildArgsChanged(re, child_args)) {
|
||||
re->simple_ = true;
|
||||
return re->Incref();
|
||||
}
|
||||
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||
nre->AllocSub(re->nsub_);
|
||||
nre->AllocSub(re->nsub());
|
||||
Regexp** nre_subs = nre->sub();
|
||||
for (int i = 0; i <re->nsub_; i++)
|
||||
for (int i = 0; i < re->nsub(); i++)
|
||||
nre_subs[i] = child_args[i];
|
||||
nre->simple_ = true;
|
||||
return nre;
|
||||
@ -234,7 +500,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
|
||||
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
|
||||
nre->AllocSub(1);
|
||||
nre->sub()[0] = newsub;
|
||||
nre->cap_ = re->cap_;
|
||||
nre->cap_ = re->cap();
|
||||
nre->simple_ = true;
|
||||
return nre;
|
||||
}
|
||||
@ -323,13 +589,12 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
||||
return Regexp::Plus(re->Incref(), f);
|
||||
|
||||
// General case: x{4,} is xxxx+
|
||||
Regexp* nre = new Regexp(kRegexpConcat, f);
|
||||
nre->AllocSub(min);
|
||||
VLOG(1) << "Simplify " << min;
|
||||
Regexp** nre_subs = nre->sub();
|
||||
Regexp** nre_subs = new Regexp*[min];
|
||||
for (int i = 0; i < min-1; i++)
|
||||
nre_subs[i] = re->Incref();
|
||||
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
||||
Regexp* nre = Regexp::Concat(nre_subs, min, f);
|
||||
delete[] nre_subs;
|
||||
return nre;
|
||||
}
|
||||
|
||||
@ -348,11 +613,11 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
||||
// Build leading prefix: xx. Capturing only on the last one.
|
||||
Regexp* nre = NULL;
|
||||
if (min > 0) {
|
||||
nre = new Regexp(kRegexpConcat, f);
|
||||
nre->AllocSub(min);
|
||||
Regexp** nre_subs = nre->sub();
|
||||
Regexp** nre_subs = new Regexp*[min];
|
||||
for (int i = 0; i < min; i++)
|
||||
nre_subs[i] = re->Incref();
|
||||
nre = Regexp::Concat(nre_subs, min, f);
|
||||
delete[] nre_subs;
|
||||
}
|
||||
|
||||
// Build and attach suffix: (x(x(x)?)?)?
|
||||
|
65
contrib/libre2/re2/stringpiece.cc
Normal file
65
contrib/libre2/re2/stringpiece.cc
Normal file
@ -0,0 +1,65 @@
|
||||
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
#include <ostream>
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
|
||||
|
||||
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
|
||||
size_type pos) const {
|
||||
size_type ret = std::min(size_ - pos, n);
|
||||
memcpy(buf, data_ + pos, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
StringPiece StringPiece::substr(size_type pos, size_type n) const {
|
||||
if (pos > size_) pos = size_;
|
||||
if (n > size_ - pos) n = size_ - pos;
|
||||
return StringPiece(data_ + pos, n);
|
||||
}
|
||||
|
||||
StringPiece::size_type StringPiece::find(const StringPiece& s,
|
||||
size_type pos) const {
|
||||
if (pos > size_) return npos;
|
||||
const_pointer result = std::search(data_ + pos, data_ + size_,
|
||||
s.data_, s.data_ + s.size_);
|
||||
size_type xpos = result - data_;
|
||||
return xpos + s.size_ <= size_ ? xpos : npos;
|
||||
}
|
||||
|
||||
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
|
||||
if (size_ <= 0 || pos >= size_) return npos;
|
||||
const_pointer result = std::find(data_ + pos, data_ + size_, c);
|
||||
return result != data_ + size_ ? result - data_ : npos;
|
||||
}
|
||||
|
||||
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
|
||||
size_type pos) const {
|
||||
if (size_ < s.size_) return npos;
|
||||
if (s.size_ == 0) return std::min(size_, pos);
|
||||
const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_;
|
||||
const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_);
|
||||
return result != last ? result - data_ : npos;
|
||||
}
|
||||
|
||||
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
|
||||
if (size_ <= 0) return npos;
|
||||
for (size_t i = std::min(pos + 1, size_); i != 0;) {
|
||||
if (data_[--i] == c) return i;
|
||||
}
|
||||
return npos;
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
|
||||
o.write(p.data(), p.size());
|
||||
return o;
|
||||
}
|
||||
|
||||
} // namespace re2
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_STRINGPIECE_H_
|
||||
#define RE2_STRINGPIECE_H_
|
||||
|
||||
// A string-like object that points to a sized piece of memory.
|
||||
//
|
||||
// Functions or methods may use const StringPiece& parameters to accept either
|
||||
@ -16,140 +19,145 @@
|
||||
//
|
||||
// Arghh! I wish C++ literals were "string".
|
||||
|
||||
#ifndef STRINGS_STRINGPIECE_H__
|
||||
#define STRINGS_STRINGPIECE_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
#include <cstddef>
|
||||
#include <algorithm>
|
||||
#include <iosfwd>
|
||||
#include <iterator>
|
||||
#include <string>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class StringPiece {
|
||||
private:
|
||||
const char* ptr_;
|
||||
int length_;
|
||||
|
||||
public:
|
||||
typedef char value_type;
|
||||
typedef char* pointer;
|
||||
typedef const char* const_pointer;
|
||||
typedef char& reference;
|
||||
typedef const char& const_reference;
|
||||
typedef const char* const_iterator;
|
||||
typedef const_iterator iterator;
|
||||
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
|
||||
typedef const_reverse_iterator reverse_iterator;
|
||||
typedef size_t size_type;
|
||||
typedef ptrdiff_t difference_type;
|
||||
static const size_type npos = static_cast<size_type>(-1);
|
||||
|
||||
// We provide non-explicit singleton constructors so users can pass
|
||||
// in a "const char*" or a "string" wherever a "StringPiece" is
|
||||
// expected.
|
||||
StringPiece() : ptr_(NULL), length_(0) { }
|
||||
StringPiece(const char* str)
|
||||
: ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
|
||||
StringPiece()
|
||||
: data_(NULL), size_(0) {}
|
||||
StringPiece(const std::string& str)
|
||||
: ptr_(str.data()), length_(static_cast<int>(str.size())) { }
|
||||
StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
|
||||
: data_(str.data()), size_(str.size()) {}
|
||||
StringPiece(const char* str)
|
||||
: data_(str), size_(str == NULL ? 0 : strlen(str)) {}
|
||||
StringPiece(const char* str, size_type len)
|
||||
: data_(str), size_(len) {}
|
||||
|
||||
// data() may return a pointer to a buffer with embedded NULs, and the
|
||||
// returned buffer may or may not be null terminated. Therefore it is
|
||||
// typically a mistake to pass data() to a routine that expects a NUL
|
||||
// terminated string.
|
||||
const char* data() const { return ptr_; }
|
||||
int size() const { return length_; }
|
||||
int length() const { return length_; }
|
||||
bool empty() const { return length_ == 0; }
|
||||
const_iterator begin() const { return data_; }
|
||||
const_iterator end() const { return data_ + size_; }
|
||||
const_reverse_iterator rbegin() const {
|
||||
return const_reverse_iterator(data_ + size_);
|
||||
}
|
||||
const_reverse_iterator rend() const {
|
||||
return const_reverse_iterator(data_);
|
||||
}
|
||||
|
||||
size_type size() const { return size_; }
|
||||
size_type length() const { return size_; }
|
||||
bool empty() const { return size_ == 0; }
|
||||
|
||||
const_reference operator[](size_type i) const { return data_[i]; }
|
||||
const_pointer data() const { return data_; }
|
||||
|
||||
void remove_prefix(size_type n) {
|
||||
data_ += n;
|
||||
size_ -= n;
|
||||
}
|
||||
|
||||
void remove_suffix(size_type n) {
|
||||
size_ -= n;
|
||||
}
|
||||
|
||||
void clear() { ptr_ = NULL; length_ = 0; }
|
||||
void set(const char* data, int len) { ptr_ = data; length_ = len; }
|
||||
void set(const char* str) {
|
||||
ptr_ = str;
|
||||
if (str != NULL)
|
||||
length_ = static_cast<int>(strlen(str));
|
||||
else
|
||||
length_ = 0;
|
||||
}
|
||||
void set(const void* data, int len) {
|
||||
ptr_ = reinterpret_cast<const char*>(data);
|
||||
length_ = len;
|
||||
data_ = str;
|
||||
size_ = str == NULL ? 0 : strlen(str);
|
||||
}
|
||||
|
||||
char operator[](int i) const { return ptr_[i]; }
|
||||
|
||||
void remove_prefix(int n) {
|
||||
ptr_ += n;
|
||||
length_ -= n;
|
||||
}
|
||||
|
||||
void remove_suffix(int n) {
|
||||
length_ -= n;
|
||||
}
|
||||
|
||||
int compare(const StringPiece& x) const {
|
||||
int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
|
||||
if (r == 0) {
|
||||
if (length_ < x.length_) r = -1;
|
||||
else if (length_ > x.length_) r = +1;
|
||||
}
|
||||
return r;
|
||||
void set(const char* str, size_type len) {
|
||||
data_ = str;
|
||||
size_ = len;
|
||||
}
|
||||
|
||||
std::string as_string() const {
|
||||
return std::string(data(), size());
|
||||
return std::string(data_, size_);
|
||||
}
|
||||
|
||||
// We also define ToString() here, since many other string-like
|
||||
// interfaces name the routine that converts to a C++ string
|
||||
// "ToString", and it's confusing to have the method that does that
|
||||
// for a StringPiece be called "as_string()". We also leave the
|
||||
// "as_string()" method defined here for existing code.
|
||||
std::string ToString() const {
|
||||
return std::string(data(), size());
|
||||
return std::string(data_, size_);
|
||||
}
|
||||
|
||||
void CopyToString(std::string* target) const;
|
||||
void AppendToString(std::string* target) const;
|
||||
void CopyToString(std::string* target) const {
|
||||
target->assign(data_, size_);
|
||||
}
|
||||
|
||||
// Does "this" start with "x"
|
||||
void AppendToString(std::string* target) const {
|
||||
target->append(data_, size_);
|
||||
}
|
||||
|
||||
size_type copy(char* buf, size_type n, size_type pos = 0) const;
|
||||
StringPiece substr(size_type pos = 0, size_type n = npos) const;
|
||||
|
||||
int compare(const StringPiece& x) const {
|
||||
size_type min_size = std::min(size(), x.size());
|
||||
if (min_size > 0) {
|
||||
int r = memcmp(data(), x.data(), min_size);
|
||||
if (r < 0) return -1;
|
||||
if (r > 0) return 1;
|
||||
}
|
||||
if (size() < x.size()) return -1;
|
||||
if (size() > x.size()) return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Does "this" start with "x"?
|
||||
bool starts_with(const StringPiece& x) const {
|
||||
return ((length_ >= x.length_) &&
|
||||
(memcmp(ptr_, x.ptr_, x.length_) == 0));
|
||||
return x.empty() ||
|
||||
(size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
|
||||
}
|
||||
|
||||
// Does "this" end with "x"
|
||||
// Does "this" end with "x"?
|
||||
bool ends_with(const StringPiece& x) const {
|
||||
return ((length_ >= x.length_) &&
|
||||
(memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
|
||||
return x.empty() ||
|
||||
(size() >= x.size() &&
|
||||
memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
|
||||
}
|
||||
|
||||
// standard STL container boilerplate
|
||||
typedef char value_type;
|
||||
typedef const char* pointer;
|
||||
typedef const char& reference;
|
||||
typedef const char& const_reference;
|
||||
typedef size_t size_type;
|
||||
typedef ptrdiff_t difference_type;
|
||||
static const size_type npos;
|
||||
typedef const char* const_iterator;
|
||||
typedef const char* iterator;
|
||||
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
|
||||
typedef std::reverse_iterator<iterator> reverse_iterator;
|
||||
iterator begin() const { return ptr_; }
|
||||
iterator end() const { return ptr_ + length_; }
|
||||
const_reverse_iterator rbegin() const {
|
||||
return const_reverse_iterator(ptr_ + length_);
|
||||
bool contains(const StringPiece& s) const {
|
||||
return find(s) != npos;
|
||||
}
|
||||
const_reverse_iterator rend() const {
|
||||
return const_reverse_iterator(ptr_);
|
||||
}
|
||||
// STLS says return size_type, but Google says return int
|
||||
int max_size() const { return length_; }
|
||||
int capacity() const { return length_; }
|
||||
|
||||
int copy(char* buf, size_type n, size_type pos = 0) const;
|
||||
size_type find(const StringPiece& s, size_type pos = 0) const;
|
||||
size_type find(char c, size_type pos = 0) const;
|
||||
size_type rfind(const StringPiece& s, size_type pos = npos) const;
|
||||
size_type rfind(char c, size_type pos = npos) const;
|
||||
|
||||
int find(const StringPiece& s, size_type pos = 0) const;
|
||||
int find(char c, size_type pos = 0) const;
|
||||
int rfind(const StringPiece& s, size_type pos = npos) const;
|
||||
int rfind(char c, size_type pos = npos) const;
|
||||
|
||||
StringPiece substr(size_type pos, size_type n = npos) const;
|
||||
|
||||
static bool _equal(const StringPiece&, const StringPiece&);
|
||||
private:
|
||||
const_pointer data_;
|
||||
size_type size_;
|
||||
};
|
||||
|
||||
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
||||
return StringPiece::_equal(x, y);
|
||||
StringPiece::size_type len = x.size();
|
||||
if (len != y.size()) return false;
|
||||
return x.data() == y.data() || len == 0 ||
|
||||
memcmp(x.data(), y.data(), len) == 0;
|
||||
}
|
||||
|
||||
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
||||
@ -157,9 +165,9 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
||||
}
|
||||
|
||||
inline bool operator<(const StringPiece& x, const StringPiece& y) {
|
||||
const int r = memcmp(x.data(), y.data(),
|
||||
std::min(x.size(), y.size()));
|
||||
return ((r < 0) || ((r == 0) && (x.size() < y.size())));
|
||||
StringPiece::size_type min_size = std::min(x.size(), y.size());
|
||||
int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
|
||||
return (r < 0) || (r == 0 && x.size() < y.size());
|
||||
}
|
||||
|
||||
inline bool operator>(const StringPiece& x, const StringPiece& y) {
|
||||
@ -174,9 +182,9 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
|
||||
return !(x < y);
|
||||
}
|
||||
|
||||
// Allow StringPiece to be logged.
|
||||
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
// allow StringPiece to be logged
|
||||
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
|
||||
|
||||
#endif // STRINGS_STRINGPIECE_H__
|
||||
#endif // RE2_STRINGPIECE_H_
|
||||
|
@ -5,7 +5,13 @@
|
||||
// Format a regular expression structure as a string.
|
||||
// Tested by parse_test.cc
|
||||
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/strutil.h"
|
||||
#include "util/utf.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
@ -42,7 +48,8 @@ class ToStringWalker : public Regexp::Walker<int> {
|
||||
private:
|
||||
string* t_; // The string the walker appends to.
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
|
||||
ToStringWalker(const ToStringWalker&) = delete;
|
||||
ToStringWalker& operator=(const ToStringWalker&) = delete;
|
||||
};
|
||||
|
||||
string Regexp::ToString() {
|
||||
@ -94,6 +101,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
||||
|
||||
case kRegexpCapture:
|
||||
t_->append("(");
|
||||
if (re->cap() == 0)
|
||||
LOG(DFATAL) << "kRegexpCapture cap() == 0";
|
||||
if (re->name()) {
|
||||
t_->append("?P<");
|
||||
t_->append(*re->name());
|
||||
@ -120,13 +129,12 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
||||
static void AppendLiteral(string *t, Rune r, bool foldcase) {
|
||||
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
|
||||
t->append(1, '\\');
|
||||
t->append(1, r);
|
||||
t->append(1, static_cast<char>(r));
|
||||
} else if (foldcase && 'a' <= r && r <= 'z') {
|
||||
if ('a' <= r && r <= 'z')
|
||||
r += 'A' - 'a';
|
||||
r -= 'a' - 'A';
|
||||
t->append(1, '[');
|
||||
t->append(1, r);
|
||||
t->append(1, r + 'a' - 'A');
|
||||
t->append(1, static_cast<char>(r));
|
||||
t->append(1, static_cast<char>(r) + 'a' - 'A');
|
||||
t->append(1, ']');
|
||||
} else {
|
||||
AppendCCRange(t, r, r);
|
||||
@ -154,12 +162,14 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
||||
break;
|
||||
|
||||
case kRegexpLiteral:
|
||||
AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
|
||||
AppendLiteral(t_, re->rune(),
|
||||
(re->parse_flags() & Regexp::FoldCase) != 0);
|
||||
break;
|
||||
|
||||
case kRegexpLiteralString:
|
||||
for (int i = 0; i < re->nrunes(); i++)
|
||||
AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
|
||||
AppendLiteral(t_, re->runes()[i],
|
||||
(re->parse_flags() & Regexp::FoldCase) != 0);
|
||||
if (prec < PrecConcat)
|
||||
t_->append(")");
|
||||
break;
|
||||
@ -297,7 +307,7 @@ static void AppendCCChar(string* t, Rune r) {
|
||||
if (0x20 <= r && r <= 0x7E) {
|
||||
if (strchr("[]^-\\", r))
|
||||
t->append("\\");
|
||||
t->append(1, r);
|
||||
t->append(1, static_cast<char>(r));
|
||||
return;
|
||||
}
|
||||
switch (r) {
|
||||
|
@ -9,7 +9,7 @@ import re
|
||||
import urllib2
|
||||
|
||||
# Directory or URL where Unicode tables reside.
|
||||
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"
|
||||
_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd"
|
||||
|
||||
# Largest valid Unicode code value.
|
||||
_RUNE_MAX = 0x10FFFF
|
||||
|
@ -7,7 +7,7 @@
|
||||
namespace re2 {
|
||||
|
||||
|
||||
// 1034 groups, 2089 pairs, 289 ranges
|
||||
// 1295 groups, 2620 pairs, 343 ranges
|
||||
const CaseFold unicode_casefold[] = {
|
||||
{ 65, 90, 32 },
|
||||
{ 97, 106, -32 },
|
||||
@ -105,13 +105,17 @@ const CaseFold unicode_casefold[] = {
|
||||
{ 598, 599, -205 },
|
||||
{ 601, 601, -202 },
|
||||
{ 603, 603, -203 },
|
||||
{ 604, 604, 42319 },
|
||||
{ 608, 608, -205 },
|
||||
{ 609, 609, 42315 },
|
||||
{ 611, 611, -207 },
|
||||
{ 613, 613, 42280 },
|
||||
{ 614, 614, 42308 },
|
||||
{ 616, 616, -209 },
|
||||
{ 617, 617, -211 },
|
||||
{ 618, 618, 42308 },
|
||||
{ 619, 619, 10743 },
|
||||
{ 620, 620, 42305 },
|
||||
{ 623, 623, -211 },
|
||||
{ 625, 625, 10749 },
|
||||
{ 626, 626, -213 },
|
||||
@ -119,15 +123,19 @@ const CaseFold unicode_casefold[] = {
|
||||
{ 637, 637, 10727 },
|
||||
{ 640, 640, -218 },
|
||||
{ 643, 643, -218 },
|
||||
{ 647, 647, 42282 },
|
||||
{ 648, 648, -218 },
|
||||
{ 649, 649, -69 },
|
||||
{ 650, 651, -217 },
|
||||
{ 652, 652, -71 },
|
||||
{ 658, 658, -219 },
|
||||
{ 669, 669, 42261 },
|
||||
{ 670, 670, 42258 },
|
||||
{ 837, 837, 84 },
|
||||
{ 880, 883, EvenOdd },
|
||||
{ 886, 887, EvenOdd },
|
||||
{ 891, 893, 130 },
|
||||
{ 895, 895, 116 },
|
||||
{ 902, 902, 38 },
|
||||
{ 904, 906, 37 },
|
||||
{ 908, 908, 64 },
|
||||
@ -168,6 +176,7 @@ const CaseFold unicode_casefold[] = {
|
||||
{ 1008, 1008, -86 },
|
||||
{ 1009, 1009, -80 },
|
||||
{ 1010, 1010, 7 },
|
||||
{ 1011, 1011, -116 },
|
||||
{ 1012, 1012, -92 },
|
||||
{ 1013, 1013, -96 },
|
||||
{ 1015, 1016, OddEven },
|
||||
@ -176,19 +185,43 @@ const CaseFold unicode_casefold[] = {
|
||||
{ 1021, 1023, -130 },
|
||||
{ 1024, 1039, 80 },
|
||||
{ 1040, 1071, 32 },
|
||||
{ 1072, 1103, -32 },
|
||||
{ 1072, 1073, -32 },
|
||||
{ 1074, 1074, 6222 },
|
||||
{ 1075, 1075, -32 },
|
||||
{ 1076, 1076, 6221 },
|
||||
{ 1077, 1085, -32 },
|
||||
{ 1086, 1086, 6212 },
|
||||
{ 1087, 1088, -32 },
|
||||
{ 1089, 1090, 6210 },
|
||||
{ 1091, 1097, -32 },
|
||||
{ 1098, 1098, 6204 },
|
||||
{ 1099, 1103, -32 },
|
||||
{ 1104, 1119, -80 },
|
||||
{ 1120, 1153, EvenOdd },
|
||||
{ 1120, 1122, EvenOdd },
|
||||
{ 1123, 1123, 6180 },
|
||||
{ 1124, 1153, EvenOdd },
|
||||
{ 1162, 1215, EvenOdd },
|
||||
{ 1216, 1216, 15 },
|
||||
{ 1217, 1230, OddEven },
|
||||
{ 1231, 1231, -15 },
|
||||
{ 1232, 1319, EvenOdd },
|
||||
{ 1232, 1327, EvenOdd },
|
||||
{ 1329, 1366, 48 },
|
||||
{ 1377, 1414, -48 },
|
||||
{ 4256, 4293, 7264 },
|
||||
{ 4295, 4295, 7264 },
|
||||
{ 4301, 4301, 7264 },
|
||||
{ 5024, 5103, 38864 },
|
||||
{ 5104, 5109, 8 },
|
||||
{ 5112, 5117, -8 },
|
||||
{ 7296, 7296, -6254 },
|
||||
{ 7297, 7297, -6253 },
|
||||
{ 7298, 7298, -6244 },
|
||||
{ 7299, 7299, -6242 },
|
||||
{ 7300, 7300, EvenOdd },
|
||||
{ 7301, 7301, -6243 },
|
||||
{ 7302, 7302, -6236 },
|
||||
{ 7303, 7303, -6181 },
|
||||
{ 7304, 7304, 35266 },
|
||||
{ 7545, 7545, 35332 },
|
||||
{ 7549, 7549, 3814 },
|
||||
{ 7680, 7776, EvenOdd },
|
||||
@ -282,8 +315,10 @@ const CaseFold unicode_casefold[] = {
|
||||
{ 11520, 11557, -7264 },
|
||||
{ 11559, 11559, -7264 },
|
||||
{ 11565, 11565, -7264 },
|
||||
{ 42560, 42605, EvenOdd },
|
||||
{ 42624, 42647, EvenOdd },
|
||||
{ 42560, 42570, EvenOdd },
|
||||
{ 42571, 42571, -35267 },
|
||||
{ 42572, 42605, EvenOdd },
|
||||
{ 42624, 42651, EvenOdd },
|
||||
{ 42786, 42799, EvenOdd },
|
||||
{ 42802, 42863, EvenOdd },
|
||||
{ 42873, 42876, OddEven },
|
||||
@ -292,16 +327,35 @@ const CaseFold unicode_casefold[] = {
|
||||
{ 42891, 42892, OddEven },
|
||||
{ 42893, 42893, -42280 },
|
||||
{ 42896, 42899, EvenOdd },
|
||||
{ 42912, 42921, EvenOdd },
|
||||
{ 42902, 42921, EvenOdd },
|
||||
{ 42922, 42922, -42308 },
|
||||
{ 42923, 42923, -42319 },
|
||||
{ 42924, 42924, -42315 },
|
||||
{ 42925, 42925, -42305 },
|
||||
{ 42926, 42926, -42308 },
|
||||
{ 42928, 42928, -42258 },
|
||||
{ 42929, 42929, -42282 },
|
||||
{ 42930, 42930, -42261 },
|
||||
{ 42931, 42931, 928 },
|
||||
{ 42932, 42935, EvenOdd },
|
||||
{ 43859, 43859, -928 },
|
||||
{ 43888, 43967, -38864 },
|
||||
{ 65313, 65338, 32 },
|
||||
{ 65345, 65370, -32 },
|
||||
{ 66560, 66599, 40 },
|
||||
{ 66600, 66639, -40 },
|
||||
{ 66736, 66771, 40 },
|
||||
{ 66776, 66811, -40 },
|
||||
{ 68736, 68786, 64 },
|
||||
{ 68800, 68850, -64 },
|
||||
{ 71840, 71871, 32 },
|
||||
{ 71872, 71903, -32 },
|
||||
{ 125184, 125217, 34 },
|
||||
{ 125218, 125251, -34 },
|
||||
};
|
||||
const int num_unicode_casefold = 289;
|
||||
const int num_unicode_casefold = 343;
|
||||
|
||||
// 1034 groups, 1055 pairs, 167 ranges
|
||||
// 1295 groups, 1325 pairs, 191 ranges
|
||||
const CaseFold unicode_tolower[] = {
|
||||
{ 65, 90, 32 },
|
||||
{ 181, 181, 775 },
|
||||
@ -370,6 +424,7 @@ const CaseFold unicode_tolower[] = {
|
||||
{ 837, 837, 116 },
|
||||
{ 880, 882, EvenOddSkip },
|
||||
{ 886, 886, EvenOdd },
|
||||
{ 895, 895, 116 },
|
||||
{ 902, 902, 38 },
|
||||
{ 904, 906, 37 },
|
||||
{ 908, 908, 64 },
|
||||
@ -397,11 +452,20 @@ const CaseFold unicode_tolower[] = {
|
||||
{ 1162, 1214, EvenOddSkip },
|
||||
{ 1216, 1216, 15 },
|
||||
{ 1217, 1229, OddEvenSkip },
|
||||
{ 1232, 1318, EvenOddSkip },
|
||||
{ 1232, 1326, EvenOddSkip },
|
||||
{ 1329, 1366, 48 },
|
||||
{ 4256, 4293, 7264 },
|
||||
{ 4295, 4295, 7264 },
|
||||
{ 4301, 4301, 7264 },
|
||||
{ 5112, 5117, -8 },
|
||||
{ 7296, 7296, -6222 },
|
||||
{ 7297, 7297, -6221 },
|
||||
{ 7298, 7298, -6212 },
|
||||
{ 7299, 7300, -6210 },
|
||||
{ 7301, 7301, -6211 },
|
||||
{ 7302, 7302, -6204 },
|
||||
{ 7303, 7303, -6180 },
|
||||
{ 7304, 7304, 35267 },
|
||||
{ 7680, 7828, EvenOddSkip },
|
||||
{ 7835, 7835, -58 },
|
||||
{ 7838, 7838, -7615 },
|
||||
@ -457,7 +521,7 @@ const CaseFold unicode_tolower[] = {
|
||||
{ 11499, 11501, OddEvenSkip },
|
||||
{ 11506, 11506, EvenOdd },
|
||||
{ 42560, 42604, EvenOddSkip },
|
||||
{ 42624, 42646, EvenOddSkip },
|
||||
{ 42624, 42650, EvenOddSkip },
|
||||
{ 42786, 42798, EvenOddSkip },
|
||||
{ 42802, 42862, EvenOddSkip },
|
||||
{ 42873, 42875, OddEvenSkip },
|
||||
@ -466,12 +530,26 @@ const CaseFold unicode_tolower[] = {
|
||||
{ 42891, 42891, OddEven },
|
||||
{ 42893, 42893, -42280 },
|
||||
{ 42896, 42898, EvenOddSkip },
|
||||
{ 42912, 42920, EvenOddSkip },
|
||||
{ 42902, 42920, EvenOddSkip },
|
||||
{ 42922, 42922, -42308 },
|
||||
{ 42923, 42923, -42319 },
|
||||
{ 42924, 42924, -42315 },
|
||||
{ 42925, 42925, -42305 },
|
||||
{ 42926, 42926, -42308 },
|
||||
{ 42928, 42928, -42258 },
|
||||
{ 42929, 42929, -42282 },
|
||||
{ 42930, 42930, -42261 },
|
||||
{ 42931, 42931, 928 },
|
||||
{ 42932, 42934, EvenOddSkip },
|
||||
{ 43888, 43967, -38864 },
|
||||
{ 65313, 65338, 32 },
|
||||
{ 66560, 66599, 40 },
|
||||
{ 66736, 66771, 40 },
|
||||
{ 68736, 68786, 64 },
|
||||
{ 71840, 71871, 32 },
|
||||
{ 125184, 125217, 34 },
|
||||
};
|
||||
const int num_unicode_tolower = 167;
|
||||
const int num_unicode_tolower = 191;
|
||||
|
||||
|
||||
|
||||
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UNICODE_CASEFOLD_H_
|
||||
#define RE2_UNICODE_CASEFOLD_H_
|
||||
|
||||
// Unicode case folding tables.
|
||||
|
||||
// The Unicode case folding tables encode the mapping from one Unicode point
|
||||
@ -16,7 +19,7 @@
|
||||
// 'K' -> 'K'
|
||||
//
|
||||
// Like everything Unicode, these tables are big. If we represent the table
|
||||
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
|
||||
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
|
||||
// Most table entries look like the ones around them:
|
||||
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
|
||||
// Instead of listing all the pairs explicitly, we make a list of ranges
|
||||
@ -36,10 +39,10 @@
|
||||
// The grouped form also allows for efficient fold range calculations
|
||||
// rather than looping one character at a time.
|
||||
|
||||
#ifndef RE2_UNICODE_CASEFOLD_H__
|
||||
#define RE2_UNICODE_CASEFOLD_H__
|
||||
#include <stdint.h>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
@ -51,9 +54,9 @@ enum {
|
||||
};
|
||||
|
||||
struct CaseFold {
|
||||
uint32 lo;
|
||||
uint32 hi;
|
||||
int32 delta;
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
int32_t delta;
|
||||
};
|
||||
|
||||
extern const CaseFold unicode_casefold[];
|
||||
@ -72,4 +75,4 @@ extern Rune ApplyFold(const CaseFold *f, Rune r);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UNICODE_CASEFOLD_H__
|
||||
#endif // RE2_UNICODE_CASEFOLD_H_
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UNICODE_GROUPS_H_
|
||||
#define RE2_UNICODE_GROUPS_H_
|
||||
|
||||
// Unicode character groups.
|
||||
|
||||
// The codes get split into ranges of 16-bit codes
|
||||
@ -15,23 +18,23 @@
|
||||
// to 16.5 kB of data but make the data harder to use;
|
||||
// we don't bother.
|
||||
|
||||
#ifndef RE2_UNICODE_GROUPS_H__
|
||||
#define RE2_UNICODE_GROUPS_H__
|
||||
#include <stdint.h>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct URange16
|
||||
{
|
||||
uint16 lo;
|
||||
uint16 hi;
|
||||
uint16_t lo;
|
||||
uint16_t hi;
|
||||
};
|
||||
|
||||
struct URange32
|
||||
{
|
||||
uint32 lo;
|
||||
uint32 hi;
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
};
|
||||
|
||||
struct UGroup
|
||||
@ -61,4 +64,4 @@ extern const int num_perl_groups;
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UNICODE_GROUPS_H__
|
||||
#endif // RE2_UNICODE_GROUPS_H_
|
||||
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_WALKER_INL_H_
|
||||
#define RE2_WALKER_INL_H_
|
||||
|
||||
// Helper class for traversing Regexps without recursion.
|
||||
// Clients should declare their own subclasses that override
|
||||
// the PreVisit and PostVisit methods, which are called before
|
||||
@ -10,9 +13,9 @@
|
||||
// Not quite the Visitor pattern, because (among other things)
|
||||
// the Visitor pattern is recursive.
|
||||
|
||||
#ifndef RE2_WALKER_INL_H__
|
||||
#define RE2_WALKER_INL_H__
|
||||
#include <stack>
|
||||
|
||||
#include "util/logging.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
@ -86,13 +89,14 @@ template<typename T> class Regexp::Walker {
|
||||
|
||||
private:
|
||||
// Walk state for the entire traversal.
|
||||
stack<WalkState<T> >* stack_;
|
||||
std::stack<WalkState<T> >* stack_;
|
||||
bool stopped_early_;
|
||||
int max_visits_;
|
||||
|
||||
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(Walker);
|
||||
Walker(const Walker&) = delete;
|
||||
Walker& operator=(const Walker&) = delete;
|
||||
};
|
||||
|
||||
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
|
||||
@ -130,7 +134,7 @@ template<typename T> struct WalkState {
|
||||
};
|
||||
|
||||
template<typename T> Regexp::Walker<T>::Walker() {
|
||||
stack_ = new stack<WalkState<T> >;
|
||||
stack_ = new std::stack<WalkState<T> >;
|
||||
stopped_early_ = false;
|
||||
}
|
||||
|
||||
@ -187,7 +191,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
|
||||
s->child_args = &s->child_arg;
|
||||
else if (re->nsub_ > 1)
|
||||
s->child_args = new T[re->nsub_];
|
||||
// Fall through.
|
||||
FALLTHROUGH_INTENDED;
|
||||
}
|
||||
default: {
|
||||
if (re->nsub_ > 0) {
|
||||
@ -241,4 +245,4 @@ template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_WALKER_INL_H__
|
||||
#endif // RE2_WALKER_INL_H_
|
||||
|
@ -1,5 +1,6 @@
|
||||
file (READ ${SOURCE_FILENAME} CONTENT)
|
||||
string (REGEX REPLACE "using re2::RE2;" "" CONTENT "${CONTENT}")
|
||||
string (REGEX REPLACE "using re2::LazyRE2;" "" CONTENT "${CONTENT}")
|
||||
string (REGEX REPLACE "namespace re2" "namespace re2_st" CONTENT "${CONTENT}")
|
||||
string (REGEX REPLACE "re2::" "re2_st::" CONTENT "${CONTENT}")
|
||||
string (REGEX REPLACE "\"re2/" "\"re2_st/" CONTENT "${CONTENT}")
|
||||
|
@ -1,168 +0,0 @@
|
||||
// Copyright 2000 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::UnsafeArena()
|
||||
// UnsafeArena::~UnsafeArena()
|
||||
// Destroying the arena automatically calls Reset()
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
|
||||
// Constructs an arena whose regular blocks are |block_size| bytes.
// The first block is malloc'ed immediately; Reset() then points the free
// cursor at it.
UnsafeArena::UnsafeArena(const size_t block_size)
    : block_size_(block_size),
      freestart_(NULL),  // Real value is assigned by Reset() below.
      last_alloc_(NULL),
      remaining_(0),
      blocks_alloced_(1),
      overflow_blocks_(NULL) {
  assert(block_size > kDefaultAlignment);

  // Slot 0 of the pre-allocated table always holds the initial block.
  AllocatedBlock& first = first_blocks_[0];
  first.mem = reinterpret_cast<char*>(malloc(block_size_));
  first.size = block_size_;

  Reset();
}
|
||||
|
||||
// Releases every block owned by the arena, including the first one that
// FreeBlocks() deliberately keeps.
UnsafeArena::~UnsafeArena() {
  FreeBlocks();
  // FreeBlocks() is expected to have deleted and cleared the overflow list.
  assert(overflow_blocks_ == NULL);

  // Free whatever pre-allocated slots are still counted as in use.
  int idx = 0;
  while (idx < blocks_alloced_) {
    free(first_blocks_[idx].mem);
    ++idx;
  }
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::Reset()
|
||||
// Clears all the memory an arena is using.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
// Returns the arena to its initial state: frees every block except the
// first and rewinds the free cursor to the (aligned) start of that block.
void UnsafeArena::Reset() {
  FreeBlocks();

  last_alloc_ = NULL;
  freestart_ = first_blocks_[0].mem;
  remaining_ = first_blocks_[0].size;

  // The first block is not known to be aligned, so skip forward to the next
  // kDefaultAlignment boundary if necessary.
  const int misalignment =
      reinterpret_cast<uintptr_t>(freestart_) & (kDefaultAlignment - 1);
  if (misalignment > 0) {
    const int padding = kDefaultAlignment - misalignment;
    freestart_ += padding;
    remaining_ -= padding;
  }

  freestart_when_empty_ = freestart_;
  assert(!(reinterpret_cast<uintptr_t>(freestart_) & (kDefaultAlignment - 1)));
}
|
||||
|
||||
// -------------------------------------------------------------
|
||||
// UnsafeArena::AllocNewBlock()
|
||||
// Adds and returns an AllocatedBlock.
|
||||
// The returned AllocatedBlock* is valid until the next call
|
||||
// to AllocNewBlock or Reset. (i.e. anything that might
|
||||
// affect overflow_blocks_).
|
||||
// -------------------------------------------------------------
|
||||
|
||||
// Allocates a new block of |block_size| bytes and returns its bookkeeping
// record.  The record lives in the fixed first_blocks_ table while there is
// room, and in the heap-allocated overflow vector afterwards.
UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
  AllocatedBlock* record = NULL;

  const bool table_has_room =
      static_cast<size_t>(blocks_alloced_) < arraysize(first_blocks_);
  if (table_has_room) {
    // Claim the next unused pre-allocated slot.
    record = &first_blocks_[blocks_alloced_];
    blocks_alloced_++;
  } else {
    // Table exhausted: create the overflow vector on first use, then grow it
    // by one element and use that new last element.
    if (overflow_blocks_ == NULL)
      overflow_blocks_ = new vector<AllocatedBlock>;
    overflow_blocks_->resize(overflow_blocks_->size() + 1);
    record = &overflow_blocks_->back();
  }

  record->mem = reinterpret_cast<char*>(malloc(block_size));
  record->size = block_size;
  return record;
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::GetMemoryFallback()
|
||||
// We take memory out of our pool, aligned on the byte boundary
|
||||
// requested. If we don't have space in our current pool, we
|
||||
// allocate a new block (wasting the remaining space in the
|
||||
// current block) and give you that. If your memory needs are
|
||||
// too big for a single block, we make a special your-memory-only
|
||||
// allocation -- this is equivalent to not using the arena at all.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
|
||||
if (size == 0)
|
||||
return NULL; // stl/stl_alloc.h says this is okay
|
||||
|
||||
assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2
|
||||
|
||||
// If the object is more than a quarter of the block size, allocate
|
||||
// it separately to avoid wasting too much space in leftover bytes
|
||||
if (block_size_ == 0 || size > block_size_/4) {
|
||||
// then it gets its own block in the arena
|
||||
assert(align <= kDefaultAlignment); // because that's what new gives us
|
||||
// This block stays separate from the rest of the world; in particular
|
||||
// we don't update last_alloc_ so you can't reclaim space on this block.
|
||||
return AllocNewBlock(size)->mem;
|
||||
}
|
||||
|
||||
const int overage =
|
||||
(reinterpret_cast<uintptr_t>(freestart_) & (align-1));
|
||||
if (overage) {
|
||||
const int waste = align - overage;
|
||||
freestart_ += waste;
|
||||
if (waste < static_cast<int>(remaining_)) {
|
||||
remaining_ -= waste;
|
||||
} else {
|
||||
remaining_ = 0;
|
||||
}
|
||||
}
|
||||
if (size > remaining_) {
|
||||
AllocatedBlock *block = AllocNewBlock(block_size_);
|
||||
freestart_ = block->mem;
|
||||
remaining_ = block->size;
|
||||
}
|
||||
remaining_ -= size;
|
||||
last_alloc_ = freestart_;
|
||||
freestart_ += size;
|
||||
assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
|
||||
return reinterpret_cast<void*>(last_alloc_);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// UnsafeArena::FreeBlocks()
|
||||
// Unlike GetMemory(), which does actual work, ReturnMemory() is a
|
||||
// no-op: we don't "free" memory until Reset() is called. We do
|
||||
// update some stats, though. Note we do no checking that the
|
||||
// pointer you pass in was actually allocated by us, or that it
|
||||
// was allocated for the size you say, so be careful here!
|
||||
// FreeBlocks() does the work for Reset(), actually freeing all
|
||||
// memory allocated in one fell swoop.
|
||||
// ----------------------------------------------------------------------
|
||||
|
||||
void UnsafeArena::FreeBlocks() {
|
||||
for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced
|
||||
free(first_blocks_[i].mem);
|
||||
first_blocks_[i].mem = NULL;
|
||||
first_blocks_[i].size = 0;
|
||||
}
|
||||
blocks_alloced_ = 1;
|
||||
if (overflow_blocks_ != NULL) {
|
||||
vector<AllocatedBlock>::iterator it;
|
||||
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
|
||||
free(it->mem);
|
||||
}
|
||||
delete overflow_blocks_; // These should be used very rarely
|
||||
overflow_blocks_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
@ -1,103 +0,0 @@
|
||||
// Copyright 2000 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Sometimes it is necessary to allocate a large number of small
|
||||
// objects. Doing this the usual way (malloc, new) is slow,
|
||||
// especially for multithreaded programs. An UnsafeArena provides a
|
||||
// mark/release method of memory management: it asks for a large chunk
|
||||
// from the operating system and doles it out bit by bit as required.
|
||||
// Then you free all the memory at once by calling UnsafeArena::Reset().
|
||||
// The "Unsafe" refers to the fact that UnsafeArena is not safe to
|
||||
// call from multiple threads.
|
||||
//
|
||||
// The global operator new that can be used as follows:
|
||||
//
|
||||
// #include "lib/arena-inl.h"
|
||||
//
|
||||
// UnsafeArena arena(1000);
|
||||
// Foo* foo = new (AllocateInArena, &arena) Foo;
|
||||
//
|
||||
|
||||
#ifndef RE2_UTIL_ARENA_H_
|
||||
#define RE2_UTIL_ARENA_H_
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// This class is thread-compatible.
|
||||
class UnsafeArena {
|
||||
public:
|
||||
UnsafeArena(const size_t block_size);
|
||||
virtual ~UnsafeArena();
|
||||
|
||||
void Reset();
|
||||
|
||||
// This should be the worst-case alignment for any type. This is
|
||||
// good for IA-32, SPARC version 7 (the last one I know), and
|
||||
// supposedly Alpha. i386 would be more time-efficient with a
|
||||
// default alignment of 8, but ::operator new() uses alignment of 4,
|
||||
// and an assertion will fail below after the call to MakeNewBlock()
|
||||
// if you try to use a larger alignment.
|
||||
#ifdef __i386__
|
||||
static const int kDefaultAlignment = 4;
|
||||
#else
|
||||
static const int kDefaultAlignment = 8;
|
||||
#endif
|
||||
|
||||
private:
|
||||
void* GetMemoryFallback(const size_t size, const int align);
|
||||
|
||||
public:
|
||||
void* GetMemory(const size_t size, const int align) {
|
||||
if ( size > 0 && size < remaining_ && align == 1 ) { // common case
|
||||
last_alloc_ = freestart_;
|
||||
freestart_ += size;
|
||||
remaining_ -= size;
|
||||
return reinterpret_cast<void*>(last_alloc_);
|
||||
}
|
||||
return GetMemoryFallback(size, align);
|
||||
}
|
||||
|
||||
private:
|
||||
struct AllocatedBlock {
|
||||
char *mem;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
// The returned AllocatedBlock* is valid until the next call to AllocNewBlock
|
||||
// or Reset (i.e. anything that might affect overflow_blocks_).
|
||||
AllocatedBlock *AllocNewBlock(const size_t block_size);
|
||||
|
||||
const AllocatedBlock *IndexToBlock(int index) const;
|
||||
|
||||
const size_t block_size_;
|
||||
char* freestart_; // beginning of the free space in most recent block
|
||||
char* freestart_when_empty_; // beginning of the free space when we're empty
|
||||
char* last_alloc_; // used to make sure ReturnBytes() is safe
|
||||
size_t remaining_;
|
||||
// STL vector isn't as efficient as it could be, so we use an array at first
|
||||
int blocks_alloced_; // how many of the first_blocks_ have been alloced
|
||||
AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
|
||||
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
|
||||
vector<AllocatedBlock>* overflow_blocks_;
|
||||
|
||||
void FreeBlocks(); // Frees all except first block
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
|
||||
};
|
||||
|
||||
// Operators for allocation on the arena
|
||||
// Syntax: new (AllocateInArena, arena) MyClass;
|
||||
// STL containers, etc.
|
||||
enum AllocateInArenaType { AllocateInArena };
|
||||
|
||||
} // namespace re2
|
||||
|
||||
inline void* operator new(size_t size,
|
||||
re2::AllocateInArenaType /* unused */,
|
||||
re2::UnsafeArena *arena) {
|
||||
return reinterpret_cast<char*>(arena->GetMemory(size, 1));
|
||||
}
|
||||
|
||||
#endif // RE2_UTIL_ARENA_H_
|
||||
|
@ -1,137 +0,0 @@
|
||||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_ATOMICOPS_H__
|
||||
#define RE2_UTIL_ATOMICOPS_H__
|
||||
|
||||
// The memory ordering constraints resemble the ones in C11.
|
||||
// RELAXED - no memory ordering, just an atomic operation.
|
||||
// CONSUME - data-dependent ordering.
|
||||
// ACQUIRE - prevents memory accesses from hoisting above the operation.
|
||||
// RELEASE - prevents memory accesses from sinking below the operation.
|
||||
|
||||
#if (__clang_major__ * 100 + __clang_minor__ >= 303) || \
|
||||
(__GNUC__ * 1000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ >= 40801)
|
||||
|
||||
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0)
|
||||
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0)
|
||||
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0)
|
||||
#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
|
||||
#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE)
|
||||
|
||||
#else // old compiler
|
||||
|
||||
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0)
|
||||
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0)
|
||||
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0)
|
||||
#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0)
|
||||
#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0)
|
||||
|
||||
// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier()
|
||||
// are an implementation detail and must not be used in the rest of the code.
|
||||
|
||||
#if defined(__i386__)
|
||||
|
||||
// Write barrier for 32-bit x86, built on an exchange with a memory operand
// (the lock prefix is implicit for xchg).
//
// Both exchanged operands are declared read-write: the instruction modifies
// the register as well as the memory location, and an asm statement must not
// alter an operand it declared as input-only.  The "memory" clobber matches
// the other architecture variants of this function and keeps the compiler
// from moving memory accesses across the barrier.
static inline void WriteMemoryBarrier() {
  int slot = 0;     // scratch memory operand
  int scratch = 0;  // scratch register operand
  __asm__ __volatile__("xchgl %0,%1"
                       : "+r" (scratch), "+m" (slot)
                       :
                       : "memory");
}
|
||||
|
||||
#elif defined(__x86_64__)
|
||||
|
||||
// 64-bit implementations of memory barrier can be simpler, because
|
||||
// "sfence" is guaranteed to exist.
|
||||
static inline void WriteMemoryBarrier() {
|
||||
__asm__ __volatile__("sfence" : : : "memory");
|
||||
}
|
||||
|
||||
#elif defined(__ppc__)
|
||||
|
||||
static inline void WriteMemoryBarrier() {
|
||||
__asm__ __volatile__("eieio" : : : "memory");
|
||||
}
|
||||
|
||||
#elif defined(__alpha__)
|
||||
|
||||
static inline void WriteMemoryBarrier() {
|
||||
__asm__ __volatile__("wmb" : : : "memory");
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
static inline void WriteMemoryBarrier() {
|
||||
__asm__ __volatile__("dmb st" : : : "memory");
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#include "util/mutex.h"
|
||||
|
||||
static inline void WriteMemoryBarrier() {
|
||||
// Slight overkill, but good enough:
|
||||
// any mutex implementation must have
|
||||
// a read barrier after the lock operation and
|
||||
// a write barrier before the unlock operation.
|
||||
//
|
||||
// It may be worthwhile to write architecture-specific
|
||||
// barriers for the common platforms, as above, but
|
||||
// this is a correct fallback.
|
||||
re2::Mutex mu;
|
||||
re2::MutexLock l(&mu);
|
||||
}
|
||||
|
||||
/*
|
||||
#error Need WriteMemoryBarrier for architecture.
|
||||
|
||||
// Windows
|
||||
inline void WriteMemoryBarrier() {
|
||||
LONG x;
|
||||
::InterlockedExchange(&x, 0);
|
||||
}
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
||||
// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
|
||||
// use read barriers for the readers too.
|
||||
#if defined(__alpha__)
|
||||
|
||||
static inline void MaybeReadMemoryBarrier() {
|
||||
__asm__ __volatile__("mb" : : : "memory");
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void MaybeReadMemoryBarrier() {}
|
||||
|
||||
#endif // __alpha__
|
||||
|
||||
// Read barrier for various targets.
|
||||
|
||||
#if defined(__aarch64__)
|
||||
|
||||
static inline void ReadMemoryBarrier() {
|
||||
__asm__ __volatile__("dmb ld" : : : "memory");
|
||||
}
|
||||
|
||||
#elif defined(__alpha__)
|
||||
|
||||
static inline void ReadMemoryBarrier() {
|
||||
__asm__ __volatile__("mb" : : : "memory");
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void ReadMemoryBarrier() {}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // old compiler
|
||||
|
||||
#ifndef NO_THREAD_SAFETY_ANALYSIS
|
||||
#define NO_THREAD_SAFETY_ANALYSIS
|
||||
#endif
|
||||
|
||||
#endif // RE2_UTIL_ATOMICOPS_H__
|
@ -2,6 +2,12 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "util/benchmark.h"
|
||||
@ -9,8 +15,11 @@
|
||||
|
||||
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
||||
|
||||
#ifdef _WIN32
|
||||
#define snprintf _snprintf
|
||||
#endif
|
||||
|
||||
using testing::Benchmark;
|
||||
using namespace re2;
|
||||
|
||||
static Benchmark* benchmarks[10000];
|
||||
static int nbenchmarks;
|
||||
@ -24,19 +33,17 @@ void Benchmark::Register() {
|
||||
nbenchmarks++;
|
||||
}
|
||||
|
||||
static int64 nsec() {
|
||||
struct timeval tv;
|
||||
if(gettimeofday(&tv, 0) < 0)
|
||||
return -1;
|
||||
return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000;
|
||||
static int64_t nsec() {
|
||||
return std::chrono::duration_cast<std::chrono::nanoseconds>(
|
||||
std::chrono::steady_clock::now().time_since_epoch()).count();
|
||||
}
|
||||
|
||||
static int64 bytes;
|
||||
static int64 ns;
|
||||
static int64 t0;
|
||||
static int64 items;
|
||||
static int64_t bytes;
|
||||
static int64_t ns;
|
||||
static int64_t t0;
|
||||
static int64_t items;
|
||||
|
||||
void SetBenchmarkBytesProcessed(long long x) {
|
||||
void SetBenchmarkBytesProcessed(int64_t x) {
|
||||
bytes = x;
|
||||
}
|
||||
|
||||
@ -74,7 +81,7 @@ static void runN(Benchmark *b, int n, int siz) {
|
||||
b->fnr(n, siz);
|
||||
else {
|
||||
fprintf(stderr, "%s: missing function\n", b->name);
|
||||
exit(2);
|
||||
abort();
|
||||
}
|
||||
if(t0 != 0)
|
||||
ns += nsec() - t0;
|
||||
@ -105,11 +112,11 @@ void RunBench(Benchmark* b, int nthread, int siz) {
|
||||
while(ns < (int)1e9 && n < (int)1e9) {
|
||||
last = n;
|
||||
if(ns/n == 0)
|
||||
n = 1e9;
|
||||
n = (int)1e9;
|
||||
else
|
||||
n = 1e9 / (ns/n);
|
||||
n = (int)1e9 / static_cast<int>(ns/n);
|
||||
|
||||
n = max(last+1, min(n+n/2, 100*last));
|
||||
n = std::max(last+1, std::min(n+n/2, 100*last));
|
||||
n = round(n);
|
||||
runN(b, n, siz);
|
||||
}
|
||||
@ -146,7 +153,7 @@ int main(int argc, const char** argv) {
|
||||
Benchmark* b = benchmarks[i];
|
||||
if(match(b->name, argc, argv))
|
||||
for(int j = b->threadlo; j <= b->threadhi; j++)
|
||||
for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1)
|
||||
for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1)
|
||||
RunBench(b, j, k);
|
||||
}
|
||||
}
|
||||
|
@ -2,8 +2,10 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_BENCHMARK_H__
|
||||
#define RE2_UTIL_BENCHMARK_H__
|
||||
#ifndef UTIL_BENCHMARK_H_
|
||||
#define UTIL_BENCHMARK_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
namespace testing {
|
||||
struct Benchmark {
|
||||
@ -14,7 +16,7 @@ struct Benchmark {
|
||||
int hi;
|
||||
int threadlo;
|
||||
int threadhi;
|
||||
|
||||
|
||||
void Register();
|
||||
Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }
|
||||
Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }
|
||||
@ -23,7 +25,7 @@ struct Benchmark {
|
||||
};
|
||||
} // namespace testing
|
||||
|
||||
void SetBenchmarkBytesProcessed(long long);
|
||||
void SetBenchmarkBytesProcessed(int64_t);
|
||||
void StopBenchmarkTiming();
|
||||
void StartBenchmarkTiming();
|
||||
void BenchmarkMemoryUsage();
|
||||
@ -38,4 +40,4 @@ int NumCPUs();
|
||||
::testing::Benchmark* _benchmark_##f = \
|
||||
(new ::testing::Benchmark(#f, f, lo, hi))
|
||||
|
||||
#endif // RE2_UTIL_BENCHMARK_H__
|
||||
#endif // UTIL_BENCHMARK_H_
|
||||
|
@ -2,13 +2,15 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_FLAGS_H_
|
||||
#define UTIL_FLAGS_H_
|
||||
|
||||
// Simplified version of Google's command line flags.
|
||||
// Does not support parsing the command line.
|
||||
// If you want to do that, see
|
||||
// http://code.google.com/p/google-gflags
|
||||
// https://gflags.github.io/gflags/
|
||||
|
||||
#ifndef RE2_UTIL_FLAGS_H__
|
||||
#define RE2_UTIL_FLAGS_H__
|
||||
#include <stdint.h>
|
||||
|
||||
#define DEFINE_flag(type, name, deflt, desc) \
|
||||
namespace re2 { type FLAGS_##name = deflt; }
|
||||
@ -17,11 +19,11 @@
|
||||
namespace re2 { extern type FLAGS_##name; }
|
||||
|
||||
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
|
||||
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
|
||||
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc)
|
||||
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
|
||||
|
||||
#define DECLARE_bool(name) DECLARE_flag(bool, name)
|
||||
#define DECLARE_int32(name) DECLARE_flag(int32, name)
|
||||
#define DECLARE_int32(name) DECLARE_flag(int32_t, name)
|
||||
#define DECLARE_string(name) DECLARE_flag(string, name)
|
||||
|
||||
#endif // RE2_UTIL_FLAGS_H__
|
||||
#endif // UTIL_FLAGS_H_
|
||||
|
21
contrib/libre2/util/fuzz.cc
Normal file
21
contrib/libre2/util/fuzz.cc
Normal file
@ -0,0 +1,21 @@
|
||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Entry point for libFuzzer.
|
||||
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
uint8_t data[32];
|
||||
for (int i = 0; i < 32; i++) {
|
||||
for (int j = 0; j < 32; j++) {
|
||||
data[j] = random() & 0xFF;
|
||||
}
|
||||
LLVMFuzzerTestOneInput(data, 32);
|
||||
}
|
||||
return 0;
|
||||
}
|
@ -1,231 +0,0 @@
|
||||
// Modified by Russ Cox to add "namespace re2".
|
||||
// Also threw away all but hashword and hashword2.
|
||||
// http://burtleburtle.net/bob/c/lookup3.c
|
||||
|
||||
/*
|
||||
-------------------------------------------------------------------------------
|
||||
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
|
||||
|
||||
These are functions for producing 32-bit hashes for hash table lookup.
|
||||
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
|
||||
are externally useful functions. Routines to test the hash are included
|
||||
if SELF_TEST is defined. You can use this free for any purpose. It's in
|
||||
the public domain. It has no warranty.
|
||||
|
||||
You probably want to use hashlittle(). hashlittle() and hashbig()
|
||||
hash byte arrays.  hashlittle() is faster than hashbig() on
|
||||
little-endian machines. Intel and AMD are little-endian machines.
|
||||
On second thought, you probably want hashlittle2(), which is identical to
|
||||
hashlittle() except it returns two 32-bit hashes for the price of one.
|
||||
You could implement hashbig2() if you wanted but I haven't bothered here.
|
||||
|
||||
If you want to find a hash of, say, exactly 7 integers, do
|
||||
a = i1; b = i2; c = i3;
|
||||
mix(a,b,c);
|
||||
a += i4; b += i5; c += i6;
|
||||
mix(a,b,c);
|
||||
a += i7;
|
||||
final(a,b,c);
|
||||
then use c as the hash value. If you have a variable length array of
|
||||
4-byte integers to hash, use hashword(). If you have a byte array (like
|
||||
a character string), use hashlittle(). If you have several byte arrays, or
|
||||
a mix of things, see the comments above hashlittle().
|
||||
|
||||
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
|
||||
then mix those integers. This is fast (you can do a lot more thorough
|
||||
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
|
||||
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
|
||||
|
||||
/*
|
||||
-------------------------------------------------------------------------------
|
||||
mix -- mix 3 32-bit values reversibly.
|
||||
|
||||
This is reversible, so any information in (a,b,c) before mix() is
|
||||
still in (a,b,c) after mix().
|
||||
|
||||
If four pairs of (a,b,c) inputs are run through mix(), or through
|
||||
mix() in reverse, there are at least 32 bits of the output that
|
||||
are sometimes the same for one pair and different for another pair.
|
||||
This was tested for:
|
||||
* pairs that differed by one bit, by two bits, in any combination
|
||||
of top bits of (a,b,c), or in any combination of bottom bits of
|
||||
(a,b,c).
|
||||
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
||||
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
||||
is commonly produced by subtraction) look like a single 1-bit
|
||||
difference.
|
||||
* the base values were pseudorandom, all zero but one bit set, or
|
||||
all zero plus a counter that starts at zero.
|
||||
|
||||
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
|
||||
satisfy this are
|
||||
4 6 8 16 19 4
|
||||
9 15 3 18 27 15
|
||||
14 9 3 7 17 3
|
||||
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
|
||||
for "differ" defined as + with a one-bit base and a two-bit delta. I
|
||||
used http://burtleburtle.net/bob/hash/avalanche.html to choose
|
||||
the operations, constants, and arrangements of the variables.
|
||||
|
||||
This does not achieve avalanche. There are input bits of (a,b,c)
|
||||
that fail to affect some output bits of (a,b,c), especially of a. The
|
||||
most thoroughly mixed value is c, but it doesn't really even achieve
|
||||
avalanche in c.
|
||||
|
||||
This allows some parallelism. Read-after-writes are good at doubling
|
||||
the number of bits affected, so the goal of mixing pulls in the opposite
|
||||
direction as the goal of parallelism. I did what I could. Rotates
|
||||
seem to cost as much as shifts on every machine I could lay my hands
|
||||
on, and rotates are much kinder to the top and bottom bits, so I used
|
||||
rotates.
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
#define mix(a,b,c) \
|
||||
{ \
|
||||
a -= c; a ^= rot(c, 4); c += b; \
|
||||
b -= a; b ^= rot(a, 6); a += c; \
|
||||
c -= b; c ^= rot(b, 8); b += a; \
|
||||
a -= c; a ^= rot(c,16); c += b; \
|
||||
b -= a; b ^= rot(a,19); a += c; \
|
||||
c -= b; c ^= rot(b, 4); b += a; \
|
||||
}
|
||||
|
||||
/*
|
||||
-------------------------------------------------------------------------------
|
||||
final -- final mixing of 3 32-bit values (a,b,c) into c
|
||||
|
||||
Pairs of (a,b,c) values differing in only a few bits will usually
|
||||
produce values of c that look totally different. This was tested for
|
||||
* pairs that differed by one bit, by two bits, in any combination
|
||||
of top bits of (a,b,c), or in any combination of bottom bits of
|
||||
(a,b,c).
|
||||
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
||||
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
||||
is commonly produced by subtraction) look like a single 1-bit
|
||||
difference.
|
||||
* the base values were pseudorandom, all zero but one bit set, or
|
||||
all zero plus a counter that starts at zero.
|
||||
|
||||
These constants passed:
|
||||
14 11 25 16 4 14 24
|
||||
12 14 25 16 4 14 24
|
||||
and these came close:
|
||||
4 8 15 26 3 22 24
|
||||
10 8 15 26 3 22 24
|
||||
11 8 15 26 3 22 24
|
||||
-------------------------------------------------------------------------------
|
||||
*/
|
||||
#define final(a,b,c) \
|
||||
{ \
|
||||
c ^= b; c -= rot(b,14); \
|
||||
a ^= c; a -= rot(c,11); \
|
||||
b ^= a; b -= rot(a,25); \
|
||||
c ^= b; c -= rot(b,16); \
|
||||
a ^= c; a -= rot(c,4); \
|
||||
b ^= a; b -= rot(a,14); \
|
||||
c ^= b; c -= rot(b,24); \
|
||||
}
|
||||
|
||||
namespace re2 {
|
||||
|
||||
/*
|
||||
--------------------------------------------------------------------
|
||||
This works on all machines. To be useful, it requires
|
||||
-- that the key be an array of uint32_t's, and
|
||||
-- that the length be the number of uint32_t's in the key
|
||||
|
||||
The function hashword() is identical to hashlittle() on little-endian
|
||||
machines, and identical to hashbig() on big-endian machines,
|
||||
except that the length has to be measured in uint32_ts rather than in
|
||||
bytes. hashlittle() is more complicated than hashword() only because
|
||||
hashlittle() has to dance around fitting the key bytes into registers.
|
||||
--------------------------------------------------------------------
|
||||
*/
|
||||
uint32 hashword(
const uint32 *k,                   /* the key, an array of uint32_t values */
size_t        length,               /* the length of the key, in uint32_ts */
uint32        initval)         /* the previous hash, or an arbitrary value */
{
  /* All three lanes start from the same seed, which folds in the key
     length and the caller-supplied initial value. */
  const uint32_t seed =
      0xdeadbeef + (static_cast<uint32_t>(length) << 2) + initval;
  uint32_t a = seed;
  uint32_t b = seed;
  uint32_t c = seed;

  /* Consume the key three words at a time while more than three remain. */
  for (; length > 3; length -= 3, k += 3) {
    a += k[0];
    b += k[1];
    c += k[2];
    mix(a,b,c);
  }

  /* One to three words are left (or none for an empty key).  The final
     scramble only runs when at least one word remains. */
  if (length != 0) {
    if (length >= 3)
      c += k[2];
    if (length >= 2)
      b += k[1];
    a += k[0];
    final(a,b,c);
  }

  /* c is the reported hash. */
  return c;
}
|
||||
|
||||
|
||||
/*
|
||||
--------------------------------------------------------------------
|
||||
hashword2() -- same as hashword(), but take two seeds and return two
|
||||
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
|
||||
both be initialized with seeds. If you pass in (*pb)==0, the output
|
||||
(*pc) will be the same as the return value from hashword().
|
||||
--------------------------------------------------------------------
|
||||
*/
|
||||
void hashword2 (
|
||||
const uint32 *k, /* the key, an array of uint32_t values */
|
||||
size_t length, /* the length of the key, in uint32_ts */
|
||||
uint32 *pc, /* IN: seed OUT: primary hash value */
|
||||
uint32 *pb) /* IN: more seed OUT: secondary hash value */
|
||||
{
|
||||
uint32_t a,b,c;
|
||||
|
||||
/* Set up the internal state */
|
||||
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
|
||||
c += *pb;
|
||||
|
||||
/*------------------------------------------------- handle most of the key */
|
||||
while (length > 3)
|
||||
{
|
||||
a += k[0];
|
||||
b += k[1];
|
||||
c += k[2];
|
||||
mix(a,b,c);
|
||||
length -= 3;
|
||||
k += 3;
|
||||
}
|
||||
|
||||
/*------------------------------------------- handle the last 3 uint32_t's */
|
||||
switch(length) /* all the case statements fall through */
|
||||
{
|
||||
case 3 : c+=k[2];
|
||||
case 2 : b+=k[1];
|
||||
case 1 : a+=k[0];
|
||||
final(a,b,c);
|
||||
case 0: /* case 0: nothing left to add */
|
||||
break;
|
||||
}
|
||||
/*------------------------------------------------------ report the result */
|
||||
*pc=c; *pb=b;
|
||||
}
|
||||
|
||||
} // namespace re2
|
@ -2,14 +2,19 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_LOGGING_H_
|
||||
#define UTIL_LOGGING_H_
|
||||
|
||||
// Simplified version of Google's logging.
|
||||
|
||||
#ifndef RE2_UTIL_LOGGING_H__
|
||||
#define RE2_UTIL_LOGGING_H__
|
||||
|
||||
#include <unistd.h> /* for write */
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
// Debug-only checking.
|
||||
#define DCHECK(condition) assert(condition)
|
||||
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
|
||||
@ -29,33 +34,37 @@
|
||||
#define CHECK_NE(x, y) CHECK((x) != (y))
|
||||
|
||||
#define LOG_INFO LogMessage(__FILE__, __LINE__)
|
||||
#define LOG_ERROR LOG_INFO
|
||||
#define LOG_WARNING LOG_INFO
|
||||
#define LOG_WARNING LogMessage(__FILE__, __LINE__)
|
||||
#define LOG_ERROR LogMessage(__FILE__, __LINE__)
|
||||
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
|
||||
#define LOG_QFATAL LOG_FATAL
|
||||
|
||||
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
|
||||
// It seems that one of the Windows header files defines ERROR as 0.
|
||||
#ifdef _WIN32
|
||||
#define LOG_0 LOG_INFO
|
||||
#endif
|
||||
|
||||
#ifdef NDEBUG
|
||||
#define DEBUG_MODE 0
|
||||
#define LOG_DFATAL LOG_ERROR
|
||||
#else
|
||||
#define DEBUG_MODE 1
|
||||
#define LOG_DFATAL LOG_FATAL
|
||||
#endif
|
||||
|
||||
#define LOG(severity) LOG_ ## severity.stream()
|
||||
|
||||
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
|
||||
|
||||
class LogMessage {
|
||||
public:
|
||||
LogMessage(const char* file, int line) : flushed_(false) {
|
||||
LogMessage(const char* file, int line)
|
||||
: flushed_(false) {
|
||||
stream() << file << ":" << line << ": ";
|
||||
}
|
||||
void Flush() {
|
||||
stream() << "\n";
|
||||
string s = str_.str();
|
||||
int n = (int)s.size(); // shut up msvc
|
||||
if(write(2, s.data(), n) < 0) {} // shut up gcc
|
||||
size_t n = s.size();
|
||||
if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc
|
||||
flushed_ = true;
|
||||
}
|
||||
~LogMessage() {
|
||||
@ -63,24 +72,38 @@ class LogMessage {
|
||||
Flush();
|
||||
}
|
||||
}
|
||||
ostream& stream() { return str_; }
|
||||
|
||||
std::ostream& stream() { return str_; }
|
||||
|
||||
private:
|
||||
bool flushed_;
|
||||
std::ostringstream str_;
|
||||
DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
|
||||
|
||||
LogMessage(const LogMessage&) = delete;
|
||||
LogMessage& operator=(const LogMessage&) = delete;
|
||||
};
|
||||
|
||||
// Silence "destructor never returns" warning for ~LogMessageFatal().
|
||||
// Since this is a header file, push and then pop to limit the scope.
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4722)
|
||||
#endif
|
||||
|
||||
class LogMessageFatal : public LogMessage {
|
||||
public:
|
||||
LogMessageFatal(const char* file, int line)
|
||||
: LogMessage(file, line) { }
|
||||
: LogMessage(file, line) {}
|
||||
~LogMessageFatal() {
|
||||
Flush();
|
||||
abort();
|
||||
}
|
||||
private:
|
||||
DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
|
||||
LogMessageFatal(const LogMessageFatal&) = delete;
|
||||
LogMessageFatal& operator=(const LogMessageFatal&) = delete;
|
||||
};
|
||||
|
||||
#endif // RE2_UTIL_LOGGING_H__
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
#endif // UTIL_LOGGING_H_
|
||||
|
41
contrib/libre2/util/mix.h
Normal file
41
contrib/libre2/util/mix.h
Normal file
@ -0,0 +1,41 @@
|
||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_MIX_H_
|
||||
#define UTIL_MIX_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <limits>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Silence "truncation of constant value" warning for kMul in 32-bit mode.
|
||||
// Since this is a header file, push and then pop to limit the scope.
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable: 4309)
|
||||
#endif
|
||||
|
||||
// Incremental hash accumulator over size_t values.  Each Mix() call
// multiplies the running value by a fixed constant, rotates it left by
// 19 bits and then adds the new value.
class HashMix {
 public:
  // Starts from the fixed value 1.
  HashMix() : hash_(1) {}

  // Starts from val offset by 83.
  explicit HashMix(size_t val) : hash_(val + 83) {}

  // Folds val into the running hash.
  void Mix(size_t val) {
    static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
    const int kRotate = 19;
    const int kWidth = std::numeric_limits<size_t>::digits;

    const size_t scaled = hash_ * kMul;
    const size_t rotated = (scaled << kRotate) | (scaled >> (kWidth - kRotate));
    hash_ = rotated + val;
  }

  // Current hash value.
  size_t get() const { return hash_; }

 private:
  size_t hash_;
};
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // UTIL_MIX_H_
|
@ -2,64 +2,41 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_MUTEX_H_
|
||||
#define UTIL_MUTEX_H_
|
||||
|
||||
/*
|
||||
* A simple mutex wrapper, supporting locks and read-write locks.
|
||||
* You should assume the locks are *not* re-entrant.
|
||||
*/
|
||||
|
||||
#ifndef RE2_UTIL_MUTEX_H_
|
||||
#define RE2_UTIL_MUTEX_H_
|
||||
#if !defined(_WIN32)
|
||||
#ifndef _POSIX_C_SOURCE
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
|
||||
#define MUTEX_IS_PTHREAD_RWLOCK
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
|
||||
#include <pthread.h>
|
||||
#include <stdlib.h>
|
||||
typedef pthread_rwlock_t MutexType;
|
||||
#else
|
||||
#include <mutex>
|
||||
typedef std::mutex MutexType;
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
#define HAVE_PTHREAD 1
|
||||
#define HAVE_RWLOCK 1
|
||||
|
||||
#if defined(NO_THREADS)
|
||||
typedef int MutexType; // to keep a lock-count
|
||||
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
|
||||
// Needed for pthread_rwlock_*. If it causes problems, you could take it
|
||||
// out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
|
||||
// *does* cause problems for FreeBSD, or MacOSX, but isn't needed
|
||||
// for locking there.)
|
||||
# ifdef __linux__
|
||||
# undef _XOPEN_SOURCE
|
||||
# define _XOPEN_SOURCE 500 // may be needed to get the rwlock calls
|
||||
# endif
|
||||
# include <pthread.h>
|
||||
typedef pthread_rwlock_t MutexType;
|
||||
#elif defined(HAVE_PTHREAD)
|
||||
# include <pthread.h>
|
||||
typedef pthread_mutex_t MutexType;
|
||||
#elif defined(WIN32)
|
||||
# define WIN32_LEAN_AND_MEAN // We only need minimal includes
|
||||
# ifdef GMUTEX_TRYLOCK
|
||||
// We need Windows NT or later for TryEnterCriticalSection(). If you
|
||||
// don't need that functionality, you can remove these _WIN32_WINNT
|
||||
// lines, and change TryLock() to assert(0) or something.
|
||||
# ifndef _WIN32_WINNT
|
||||
# define _WIN32_WINNT 0x0400
|
||||
# endif
|
||||
# endif
|
||||
# include <windows.h>
|
||||
typedef CRITICAL_SECTION MutexType;
|
||||
#else
|
||||
# error Need to implement mutex.h for your architecture, or #define NO_THREADS
|
||||
#endif
|
||||
|
||||
class Mutex {
|
||||
public:
|
||||
// Create a Mutex that is not held by anybody.
|
||||
inline Mutex();
|
||||
|
||||
// Destructor
|
||||
inline ~Mutex();
|
||||
|
||||
inline void Lock(); // Block if needed until free then acquire exclusively
|
||||
inline void Unlock(); // Release a lock acquired via Lock()
|
||||
inline bool TryLock(); // If free, Lock() and return true, else return false
|
||||
// Note that on systems that don't support read-write locks, these may
|
||||
// be implemented as synonyms to Lock() and Unlock(). So you can use
|
||||
// these for efficiency, but don't use them anyplace where being able
|
||||
@ -68,80 +45,44 @@ class Mutex {
|
||||
inline void ReaderUnlock(); // Release a read share of this Mutex
|
||||
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
|
||||
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
|
||||
inline void AssertHeld() { }
|
||||
|
||||
private:
|
||||
MutexType mutex_;
|
||||
|
||||
// Catch the error of writing Mutex when intending MutexLock.
|
||||
Mutex(Mutex *ignored);
|
||||
// Disallow "evil" constructors
|
||||
Mutex(const Mutex&);
|
||||
void operator=(const Mutex&);
|
||||
|
||||
Mutex(const Mutex&) = delete;
|
||||
Mutex& operator=(const Mutex&) = delete;
|
||||
};
|
||||
|
||||
// Now the implementation of Mutex for various systems
|
||||
#if defined(NO_THREADS)
|
||||
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
|
||||
|
||||
// When we don't have threads, we can be either reading or writing,
|
||||
// but not both. We can have lots of readers at once (in no-threads
|
||||
// mode, that's most likely to happen in recursive function calls),
|
||||
// but only one writer. We represent this by having mutex_ be -1 when
|
||||
// writing and a number > 0 when reading (and 0 when no lock is held).
|
||||
//
|
||||
// In debug mode, we assert these invariants, while in non-debug mode
|
||||
// we do nothing, for efficiency. That's why everything is in an
|
||||
// assert.
|
||||
#include <assert.h>
|
||||
|
||||
Mutex::Mutex() : mutex_(0) { }
|
||||
Mutex::~Mutex() { assert(mutex_ == 0); }
|
||||
void Mutex::Lock() { assert(--mutex_ == -1); }
|
||||
void Mutex::Unlock() { assert(mutex_++ == -1); }
|
||||
bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; }
|
||||
void Mutex::ReaderLock() { assert(++mutex_ > 0); }
|
||||
void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
|
||||
|
||||
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
|
||||
|
||||
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
|
||||
#define SAFE_PTHREAD(fncall) \
|
||||
do { \
|
||||
if ((fncall) != 0) abort(); \
|
||||
} while (0)
|
||||
|
||||
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
|
||||
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
|
||||
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
|
||||
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
||||
bool Mutex::TryLock() { return pthread_rwlock_trywrlock(&mutex_) == 0; }
|
||||
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
|
||||
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
||||
|
||||
#undef SAFE_PTHREAD
|
||||
|
||||
#elif defined(HAVE_PTHREAD)
|
||||
#else
|
||||
|
||||
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
|
||||
|
||||
Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); }
|
||||
Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy(&mutex_)); }
|
||||
void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock(&mutex_)); }
|
||||
void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock(&mutex_)); }
|
||||
bool Mutex::TryLock() { return pthread_mutex_trylock(&mutex_) == 0; }
|
||||
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
|
||||
void Mutex::ReaderUnlock() { Unlock(); }
|
||||
#undef SAFE_PTHREAD
|
||||
|
||||
#elif defined(WIN32)
|
||||
|
||||
Mutex::Mutex() { InitializeCriticalSection(&mutex_); }
|
||||
Mutex::~Mutex() { DeleteCriticalSection(&mutex_); }
|
||||
void Mutex::Lock() { EnterCriticalSection(&mutex_); }
|
||||
void Mutex::Unlock() { LeaveCriticalSection(&mutex_); }
|
||||
bool Mutex::TryLock() { return TryEnterCriticalSection(&mutex_) != 0; }
|
||||
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
|
||||
Mutex::Mutex() { }
|
||||
Mutex::~Mutex() { }
|
||||
void Mutex::Lock() { mutex_.lock(); }
|
||||
void Mutex::Unlock() { mutex_.unlock(); }
|
||||
void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex.
|
||||
void Mutex::ReaderUnlock() { Unlock(); }
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------
|
||||
// Some helper classes
|
||||
|
||||
@ -152,9 +93,9 @@ class MutexLock {
|
||||
~MutexLock() { mu_->Unlock(); }
|
||||
private:
|
||||
Mutex * const mu_;
|
||||
// Disallow "evil" constructors
|
||||
MutexLock(const MutexLock&);
|
||||
void operator=(const MutexLock&);
|
||||
|
||||
MutexLock(const MutexLock&) = delete;
|
||||
MutexLock& operator=(const MutexLock&) = delete;
|
||||
};
|
||||
|
||||
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
|
||||
@ -164,9 +105,9 @@ class ReaderMutexLock {
|
||||
~ReaderMutexLock() { mu_->ReaderUnlock(); }
|
||||
private:
|
||||
Mutex * const mu_;
|
||||
// Disallow "evil" constructors
|
||||
ReaderMutexLock(const ReaderMutexLock&);
|
||||
void operator=(const ReaderMutexLock&);
|
||||
|
||||
ReaderMutexLock(const ReaderMutexLock&) = delete;
|
||||
ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
|
||||
};
|
||||
|
||||
class WriterMutexLock {
|
||||
@ -175,37 +116,16 @@ class WriterMutexLock {
|
||||
~WriterMutexLock() { mu_->WriterUnlock(); }
|
||||
private:
|
||||
Mutex * const mu_;
|
||||
// Disallow "evil" constructors
|
||||
WriterMutexLock(const WriterMutexLock&);
|
||||
void operator=(const WriterMutexLock&);
|
||||
|
||||
WriterMutexLock(const WriterMutexLock&) = delete;
|
||||
WriterMutexLock& operator=(const WriterMutexLock&) = delete;
|
||||
};
|
||||
|
||||
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
|
||||
#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name)
|
||||
#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
|
||||
#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
|
||||
|
||||
// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
|
||||
#ifdef HAVE_PTHREAD
|
||||
|
||||
#define GLOBAL_MUTEX(name) \
|
||||
static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
|
||||
#define GLOBAL_MUTEX_LOCK(name) \
|
||||
pthread_mutex_lock(&(name))
|
||||
#define GLOBAL_MUTEX_UNLOCK(name) \
|
||||
pthread_mutex_unlock(&(name))
|
||||
|
||||
#else
|
||||
|
||||
#define GLOBAL_MUTEX(name) \
|
||||
static Mutex name
|
||||
#define GLOBAL_MUTEX_LOCK(name) \
|
||||
name.Lock()
|
||||
#define GLOBAL_MUTEX_UNLOCK(name) \
|
||||
name.Unlock()
|
||||
|
||||
#endif
|
||||
#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
|
||||
#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
|
||||
#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif /* #define RE2_UTIL_MUTEX_H_ */
|
||||
#endif // UTIL_MUTEX_H_
|
||||
|
@ -6,12 +6,25 @@
|
||||
// The main changes are the addition of the HitLimit method and
|
||||
// compilation as PCRE in namespace re2.
|
||||
|
||||
#include <assert.h>
|
||||
#include <ctype.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/pcre.h"
|
||||
#include "util/strutil.h"
|
||||
|
||||
#if __GNUC__ > 5
|
||||
// Silence warnings about the wacky formatting in the operator() functions.
|
||||
// Note that we test for Clang first because it defines __GNUC__ as well.
|
||||
#if defined(__clang__)
|
||||
#elif defined(__GNUC__) && __GNUC__ >= 6
|
||||
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
|
||||
#endif
|
||||
|
||||
@ -26,6 +39,42 @@ DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)");
|
||||
DEFINE_int32(regexp_match_limit, 1000000,
|
||||
"default PCRE match limit (function calls)");
|
||||
|
||||
#ifndef USEPCRE
|
||||
|
||||
// Fake just enough of the PCRE API to allow this file to build. :)
|
||||
|
||||
struct pcre_extra {
|
||||
int flags;
|
||||
int match_limit;
|
||||
int match_limit_recursion;
|
||||
};
|
||||
|
||||
#define PCRE_EXTRA_MATCH_LIMIT 0
|
||||
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
|
||||
#define PCRE_ANCHORED 0
|
||||
#define PCRE_NOTEMPTY 0
|
||||
#define PCRE_ERROR_NOMATCH 1
|
||||
#define PCRE_ERROR_MATCHLIMIT 2
|
||||
#define PCRE_ERROR_RECURSIONLIMIT 3
|
||||
#define PCRE_INFO_CAPTURECOUNT 0
|
||||
|
||||
void pcre_free(void*) {
|
||||
}
|
||||
|
||||
pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Maximum number of args we can set
|
||||
@ -117,7 +166,7 @@ pcre* PCRE::Compile(Anchor anchor) {
|
||||
// ANCHOR_BOTH Tack a "\z" to the end of the original pattern
|
||||
// and use a pcre anchored match.
|
||||
|
||||
const char* error;
|
||||
const char* error = "";
|
||||
int eoffset;
|
||||
pcre* re;
|
||||
if (anchor != ANCHOR_BOTH) {
|
||||
@ -181,8 +230,8 @@ bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
|
||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||
done:
|
||||
|
||||
int consumed;
|
||||
int vec[kVecSize];
|
||||
size_t consumed;
|
||||
int vec[kVecSize] = {};
|
||||
return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
|
||||
}
|
||||
|
||||
@ -224,8 +273,8 @@ bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
|
||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||
done:
|
||||
|
||||
int consumed;
|
||||
int vec[kVecSize];
|
||||
size_t consumed;
|
||||
int vec[kVecSize] = {};
|
||||
return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
|
||||
}
|
||||
|
||||
@ -267,8 +316,8 @@ bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
|
||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||
done:
|
||||
|
||||
int consumed;
|
||||
int vec[kVecSize];
|
||||
size_t consumed;
|
||||
int vec[kVecSize] = {};
|
||||
if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
|
||||
args, n, vec, kVecSize)) {
|
||||
input->remove_prefix(consumed);
|
||||
@ -316,8 +365,8 @@ bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
|
||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||
done:
|
||||
|
||||
int consumed;
|
||||
int vec[kVecSize];
|
||||
size_t consumed;
|
||||
int vec[kVecSize] = {};
|
||||
if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
|
||||
args, n, vec, kVecSize)) {
|
||||
input->remove_prefix(consumed);
|
||||
@ -330,7 +379,7 @@ done:
|
||||
bool PCRE::Replace(string *str,
|
||||
const PCRE& pattern,
|
||||
const StringPiece& rewrite) {
|
||||
int vec[kVecSize];
|
||||
int vec[kVecSize] = {};
|
||||
int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
|
||||
if (matches == 0)
|
||||
return false;
|
||||
@ -349,12 +398,12 @@ int PCRE::GlobalReplace(string *str,
|
||||
const PCRE& pattern,
|
||||
const StringPiece& rewrite) {
|
||||
int count = 0;
|
||||
int vec[kVecSize];
|
||||
int vec[kVecSize] = {};
|
||||
string out;
|
||||
size_t start = 0;
|
||||
bool last_match_was_empty_string = false;
|
||||
|
||||
for (; start <= str->length();) {
|
||||
while (start <= str->size()) {
|
||||
// If the previous match was for the empty string, we shouldn't
|
||||
// just match again: we'll match in the same way and get an
|
||||
// infinite loop. Instead, we do the match in a special way:
|
||||
@ -370,19 +419,20 @@ int PCRE::GlobalReplace(string *str,
|
||||
matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
|
||||
vec, kVecSize);
|
||||
if (matches <= 0) {
|
||||
if (start < str->length())
|
||||
if (start < str->size())
|
||||
out.push_back((*str)[start]);
|
||||
start++;
|
||||
last_match_was_empty_string = false;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
|
||||
matches = pattern.TryMatch(*str, start, UNANCHORED, true,
|
||||
vec, kVecSize);
|
||||
if (matches <= 0)
|
||||
break;
|
||||
}
|
||||
int matchstart = vec[0], matchend = vec[1];
|
||||
assert(matchstart >= static_cast<int>(start));
|
||||
size_t matchstart = vec[0], matchend = vec[1];
|
||||
assert(matchstart >= start);
|
||||
assert(matchend >= matchstart);
|
||||
|
||||
out.append(*str, start, matchstart - start);
|
||||
@ -395,8 +445,9 @@ int PCRE::GlobalReplace(string *str,
|
||||
if (count == 0)
|
||||
return 0;
|
||||
|
||||
if (start < str->length())
|
||||
out.append(*str, start, str->length() - start);
|
||||
if (start < str->size())
|
||||
out.append(*str, start, str->size() - start);
|
||||
using std::swap;
|
||||
swap(out, *str);
|
||||
return count;
|
||||
}
|
||||
@ -405,7 +456,7 @@ bool PCRE::Extract(const StringPiece &text,
|
||||
const PCRE& pattern,
|
||||
const StringPiece &rewrite,
|
||||
string *out) {
|
||||
int vec[kVecSize];
|
||||
int vec[kVecSize] = {};
|
||||
int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
|
||||
if (matches == 0)
|
||||
return false;
|
||||
@ -424,7 +475,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) {
|
||||
// that. (This also makes it identical to the perl function of the
|
||||
// same name except for the null-character special case;
|
||||
// see `perldoc -f quotemeta`.)
|
||||
for (int ii = 0; ii < unquoted.length(); ++ii) {
|
||||
for (size_t ii = 0; ii < unquoted.size(); ++ii) {
|
||||
// Note that using 'isalnum' here raises the benchmark time from
|
||||
// 32ns to 58ns:
|
||||
if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
|
||||
@ -451,7 +502,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) {
|
||||
/***** Actual matching and rewriting code *****/
|
||||
|
||||
bool PCRE::HitLimit() {
|
||||
return hit_limit_;
|
||||
return hit_limit_ != 0;
|
||||
}
|
||||
|
||||
void PCRE::ClearHitLimit() {
|
||||
@ -459,11 +510,11 @@ void PCRE::ClearHitLimit() {
|
||||
}
|
||||
|
||||
int PCRE::TryMatch(const StringPiece& text,
|
||||
int startpos,
|
||||
Anchor anchor,
|
||||
bool empty_ok,
|
||||
int *vec,
|
||||
int vecsize) const {
|
||||
size_t startpos,
|
||||
Anchor anchor,
|
||||
bool empty_ok,
|
||||
int *vec,
|
||||
int vecsize) const {
|
||||
pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
|
||||
if (re == NULL) {
|
||||
PCREPORT(ERROR) << "Matching against invalid re: " << *error_;
|
||||
@ -499,8 +550,8 @@ int PCRE::TryMatch(const StringPiece& text,
|
||||
int rc = pcre_exec(re, // The regular expression object
|
||||
&extra,
|
||||
(text.data() == NULL) ? "" : text.data(),
|
||||
text.size(),
|
||||
startpos,
|
||||
static_cast<int>(text.size()),
|
||||
static_cast<int>(startpos),
|
||||
options,
|
||||
vec,
|
||||
vecsize);
|
||||
@ -554,18 +605,13 @@ int PCRE::TryMatch(const StringPiece& text,
|
||||
return rc;
|
||||
}
|
||||
|
||||
#if !__clang__
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||||
#endif
|
||||
|
||||
bool PCRE::DoMatchImpl(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
const Arg* const* args,
|
||||
int n,
|
||||
int* vec,
|
||||
int vecsize) const {
|
||||
Anchor anchor,
|
||||
size_t* consumed,
|
||||
const Arg* const* args,
|
||||
int n,
|
||||
int* vec,
|
||||
int vecsize) const {
|
||||
assert((1 + n) * 3 <= vecsize); // results + PCRE workspace
|
||||
int matches = TryMatch(text, 0, anchor, true, vec, vecsize);
|
||||
assert(matches >= 0); // TryMatch never returns negatives
|
||||
@ -589,7 +635,17 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
|
||||
for (int i = 0; i < n; i++) {
|
||||
const int start = vec[2*(i+1)];
|
||||
const int limit = vec[2*(i+1)+1];
|
||||
if (!args[i]->Parse(text.data() + start, limit-start)) {
|
||||
|
||||
// Avoid invoking undefined behavior when text.data() happens
|
||||
// to be null and start happens to be -1, the latter being the
|
||||
// case for an unmatched subexpression. Even if text.data() is
|
||||
// not null, pointing one byte before was a longstanding bug.
|
||||
const char* addr = NULL;
|
||||
if (start != -1) {
|
||||
addr = text.data() + start;
|
||||
}
|
||||
|
||||
if (!args[i]->Parse(addr, limit-start)) {
|
||||
// TODO: Should we indicate what the error was?
|
||||
return false;
|
||||
}
|
||||
@ -598,19 +654,15 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
|
||||
return true;
|
||||
}
|
||||
|
||||
#if !__clang__
|
||||
#pragma GCC diagnostic pop
|
||||
#endif
|
||||
|
||||
bool PCRE::DoMatch(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
const Arg* const args[],
|
||||
int n) const {
|
||||
Anchor anchor,
|
||||
size_t* consumed,
|
||||
const Arg* const args[],
|
||||
int n) const {
|
||||
assert(n >= 0);
|
||||
size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
|
||||
// (as for kVecSize)
|
||||
int *vec = new int[vecsize];
|
||||
const int vecsize = (1 + n) * 3; // results + PCRE workspace
|
||||
// (as for kVecSize)
|
||||
int* vec = new int[vecsize];
|
||||
bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
|
||||
delete[] vec;
|
||||
return b;
|
||||
@ -695,41 +747,52 @@ int PCRE::NumberOfCapturingGroups() const {
|
||||
if (re_partial_ == NULL) return -1;
|
||||
|
||||
int result;
|
||||
CHECK(pcre_fullinfo(re_partial_, // The regular expression object
|
||||
NULL, // We did not study the pattern
|
||||
PCRE_INFO_CAPTURECOUNT,
|
||||
&result) == 0);
|
||||
int rc = pcre_fullinfo(re_partial_, // The regular expression object
|
||||
NULL, // We did not study the pattern
|
||||
PCRE_INFO_CAPTURECOUNT,
|
||||
&result);
|
||||
if (rc != 0) {
|
||||
PCREPORT(ERROR) << "Unexpected return code: " << rc;
|
||||
return -1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
/***** Parsers for various types *****/
|
||||
|
||||
bool PCRE::Arg::parse_null(const char* str, int n, void* dest) {
|
||||
bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
|
||||
// We fail if somebody asked us to store into a non-NULL void* pointer
|
||||
return (dest == NULL);
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_string(const char* str, int n, void* dest) {
|
||||
bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
|
||||
if (dest == NULL) return true;
|
||||
reinterpret_cast<string*>(dest)->assign(str, n);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) {
|
||||
bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
|
||||
if (dest == NULL) return true;
|
||||
reinterpret_cast<StringPiece*>(dest)->set(str, n);
|
||||
*(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_char(const char* str, int n, void* dest) {
|
||||
bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
|
||||
if (n != 1) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<char*>(dest)) = str[0];
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) {
|
||||
bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
|
||||
if (n != 1) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<signed char*>(dest)) = str[0];
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
|
||||
if (n != 1) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
|
||||
@ -746,7 +809,7 @@ static const int kMaxNumberLength = 32;
|
||||
// a. "str" if no termination is needed
|
||||
// b. "buf" if the string was copied and null-terminated
|
||||
// c. "" if the input was invalid and has no hope of being parsed
|
||||
static const char* TerminateNumber(char* buf, const char* str, int n) {
|
||||
static const char* TerminateNumber(char* buf, const char* str, size_t n) {
|
||||
if ((n > 0) && isspace(*str)) {
|
||||
// We are less forgiving than the strtoxxx() routines and do not
|
||||
// allow leading spaces.
|
||||
@ -769,9 +832,9 @@ static const char* TerminateNumber(char* buf, const char* str, int n) {
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_long_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
if (n == 0) return false;
|
||||
char buf[kMaxNumberLength+1];
|
||||
str = TerminateNumber(buf, str, n);
|
||||
@ -786,16 +849,16 @@ bool PCRE::Arg::parse_long_radix(const char* str,
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_ulong_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
if (n == 0) return false;
|
||||
char buf[kMaxNumberLength+1];
|
||||
str = TerminateNumber(buf, str, n);
|
||||
if (str[0] == '-') {
|
||||
// strtoul() will silently accept negative numbers and parse
|
||||
// them. This module is more strict and treats them as errors.
|
||||
return false;
|
||||
// strtoul() will silently accept negative numbers and parse
|
||||
// them. This module is more strict and treats them as errors.
|
||||
return false;
|
||||
}
|
||||
|
||||
char* end;
|
||||
@ -809,74 +872,74 @@ bool PCRE::Arg::parse_ulong_radix(const char* str,
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_short_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
long r;
|
||||
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((short)r != r) return false; // Out of range
|
||||
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((short)r != r) return false; // Out of range
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<short*>(dest)) = r;
|
||||
*(reinterpret_cast<short*>(dest)) = (short)r;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_ushort_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
unsigned long r;
|
||||
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((ushort)r != r) return false; // Out of range
|
||||
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((unsigned short)r != r) return false; // Out of range
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<unsigned short*>(dest)) = r;
|
||||
*(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_int_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
long r;
|
||||
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((int)r != r) return false; // Out of range
|
||||
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((int)r != r) return false; // Out of range
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<int*>(dest)) = r;
|
||||
*(reinterpret_cast<int*>(dest)) = (int)r;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_uint_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
unsigned long r;
|
||||
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((uint)r != r) return false; // Out of range
|
||||
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
||||
if ((unsigned int)r != r) return false; // Out of range
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<unsigned int*>(dest)) = r;
|
||||
*(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_longlong_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
if (n == 0) return false;
|
||||
char buf[kMaxNumberLength+1];
|
||||
str = TerminateNumber(buf, str, n);
|
||||
char* end;
|
||||
errno = 0;
|
||||
int64 r = strtoll(str, &end, radix);
|
||||
long long r = strtoll(str, &end, radix);
|
||||
if (end != str + n) return false; // Leftover junk
|
||||
if (errno) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<int64*>(dest)) = r;
|
||||
*(reinterpret_cast<long long*>(dest)) = r;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_ulonglong_radix(const char* str,
|
||||
int n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
size_t n,
|
||||
void* dest,
|
||||
int radix) {
|
||||
if (n == 0) return false;
|
||||
char buf[kMaxNumberLength+1];
|
||||
str = TerminateNumber(buf, str, n);
|
||||
@ -887,26 +950,32 @@ bool PCRE::Arg::parse_ulonglong_radix(const char* str,
|
||||
}
|
||||
char* end;
|
||||
errno = 0;
|
||||
uint64 r = strtoull(str, &end, radix);
|
||||
unsigned long long r = strtoull(str, &end, radix);
|
||||
if (end != str + n) return false; // Leftover junk
|
||||
if (errno) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<uint64*>(dest)) = r;
|
||||
*(reinterpret_cast<unsigned long long*>(dest)) = r;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
|
||||
static bool parse_double_float(const char* str, size_t n, bool isfloat,
|
||||
void* dest) {
|
||||
if (n == 0) return false;
|
||||
static const int kMaxLength = 200;
|
||||
char buf[kMaxLength];
|
||||
if (n >= kMaxLength) return false;
|
||||
memcpy(buf, str, n);
|
||||
buf[n] = '\0';
|
||||
errno = 0;
|
||||
char* end;
|
||||
double r = strtod(buf, &end);
|
||||
errno = 0;
|
||||
double r;
|
||||
if (isfloat) {
|
||||
r = strtof(buf, &end);
|
||||
} else {
|
||||
r = strtod(buf, &end);
|
||||
}
|
||||
if (end != buf + n) {
|
||||
#ifdef COMPILER_MSVC
|
||||
#ifdef _WIN32
|
||||
// Microsoft's strtod() doesn't handle inf and nan, so we have to
|
||||
// handle it explicitly. Speed is not important here because this
|
||||
// code is only called in unit tests.
|
||||
@ -918,12 +987,12 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
|
||||
} else if ('+' == *i) {
|
||||
++i;
|
||||
}
|
||||
if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) {
|
||||
r = numeric_limits<double>::infinity();
|
||||
if (0 == _stricmp(i, "inf") || 0 == _stricmp(i, "infinity")) {
|
||||
r = std::numeric_limits<double>::infinity();
|
||||
if (!pos)
|
||||
r = -r;
|
||||
} else if (0 == stricmp(i, "nan")) {
|
||||
r = numeric_limits<double>::quiet_NaN();
|
||||
} else if (0 == _stricmp(i, "nan")) {
|
||||
r = std::numeric_limits<double>::quiet_NaN();
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
@ -933,42 +1002,47 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
|
||||
}
|
||||
if (errno) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<double*>(dest)) = r;
|
||||
if (isfloat) {
|
||||
*(reinterpret_cast<float*>(dest)) = (float)r;
|
||||
} else {
|
||||
*(reinterpret_cast<double*>(dest)) = r;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_float(const char* str, int n, void* dest) {
|
||||
double r;
|
||||
if (!parse_double(str, n, &r)) return false;
|
||||
if (dest == NULL) return true;
|
||||
*(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
|
||||
return true;
|
||||
bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
|
||||
return parse_double_float(str, n, false, dest);
|
||||
}
|
||||
|
||||
bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
|
||||
return parse_double_float(str, n, true, dest);
|
||||
}
|
||||
|
||||
#define DEFINE_INTEGER_PARSERS(name) \
|
||||
bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 10); \
|
||||
} \
|
||||
bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 16); \
|
||||
} \
|
||||
bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 8); \
|
||||
} \
|
||||
bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 0); \
|
||||
#define DEFINE_INTEGER_PARSER(name) \
|
||||
bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 10); \
|
||||
} \
|
||||
bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 16); \
|
||||
} \
|
||||
bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
|
||||
void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 8); \
|
||||
} \
|
||||
bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
|
||||
void* dest) { \
|
||||
return parse_##name##_radix(str, n, dest, 0); \
|
||||
}
|
||||
|
||||
DEFINE_INTEGER_PARSERS(short);
|
||||
DEFINE_INTEGER_PARSERS(ushort);
|
||||
DEFINE_INTEGER_PARSERS(int);
|
||||
DEFINE_INTEGER_PARSERS(uint);
|
||||
DEFINE_INTEGER_PARSERS(long);
|
||||
DEFINE_INTEGER_PARSERS(ulong);
|
||||
DEFINE_INTEGER_PARSERS(longlong);
|
||||
DEFINE_INTEGER_PARSERS(ulonglong);
|
||||
DEFINE_INTEGER_PARSER(short);
|
||||
DEFINE_INTEGER_PARSER(ushort);
|
||||
DEFINE_INTEGER_PARSER(int);
|
||||
DEFINE_INTEGER_PARSER(uint);
|
||||
DEFINE_INTEGER_PARSER(long);
|
||||
DEFINE_INTEGER_PARSER(ulong);
|
||||
DEFINE_INTEGER_PARSER(longlong);
|
||||
DEFINE_INTEGER_PARSER(ulonglong);
|
||||
|
||||
#undef DEFINE_INTEGER_PARSERS
|
||||
#undef DEFINE_INTEGER_PARSER
|
||||
|
||||
} // namespace re2
|
||||
|
@ -2,6 +2,9 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_PCRE_H_
|
||||
#define UTIL_PCRE_H_
|
||||
|
||||
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
|
||||
// The main changes are the addition of the HitLimit method and
|
||||
// compilation as PCRE in namespace re2.
|
||||
@ -167,22 +170,9 @@ namespace re2 {
|
||||
const bool UsingPCRE = true;
|
||||
} // namespace re2
|
||||
#else
|
||||
struct pcre; // opaque
|
||||
namespace re2 {
|
||||
const bool UsingPCRE = false;
|
||||
struct pcre;
|
||||
struct pcre_extra { int flags, match_limit, match_limit_recursion; };
|
||||
#define pcre_free(x) {}
|
||||
#define PCRE_EXTRA_MATCH_LIMIT 0
|
||||
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
|
||||
#define PCRE_ANCHORED 0
|
||||
#define PCRE_NOTEMPTY 0
|
||||
#define PCRE_ERROR_NOMATCH 1
|
||||
#define PCRE_ERROR_MATCHLIMIT 2
|
||||
#define PCRE_ERROR_RECURSIONLIMIT 3
|
||||
#define PCRE_INFO_CAPTURECOUNT 0
|
||||
#define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); })
|
||||
#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; })
|
||||
#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; })
|
||||
} // namespace re2
|
||||
#endif
|
||||
|
||||
@ -258,7 +248,7 @@ class PCRE {
|
||||
// type, or one of:
|
||||
// string (matched piece is copied to string)
|
||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
||||
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
|
||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
||||
//
|
||||
// Returns true iff all of the following conditions are satisfied:
|
||||
@ -452,7 +442,7 @@ class PCRE {
|
||||
// "*consumed" if successful.
|
||||
bool DoMatch(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
size_t* consumed,
|
||||
const Arg* const* args, int n) const;
|
||||
|
||||
// Return the number of capturing subpatterns, or -1 if the
|
||||
@ -475,7 +465,7 @@ class PCRE {
|
||||
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
|
||||
// But the values for all subpattern are filled in into "vec".
|
||||
int TryMatch(const StringPiece& text,
|
||||
int startpos,
|
||||
size_t startpos,
|
||||
Anchor anchor,
|
||||
bool empty_ok,
|
||||
int *vec,
|
||||
@ -492,7 +482,7 @@ class PCRE {
|
||||
// internal implementation for DoMatch
|
||||
bool DoMatchImpl(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
size_t* consumed,
|
||||
const Arg* const args[],
|
||||
int n,
|
||||
int* vec,
|
||||
@ -509,8 +499,10 @@ class PCRE {
|
||||
bool report_errors_; // Silences error logging if false
|
||||
int match_limit_; // Limit on execution resources
|
||||
int stack_limit_; // Limit on stack resources (bytes)
|
||||
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
|
||||
DISALLOW_EVIL_CONSTRUCTORS(PCRE);
|
||||
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
|
||||
|
||||
PCRE(const PCRE&) = delete;
|
||||
PCRE& operator=(const PCRE&) = delete;
|
||||
};
|
||||
|
||||
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
|
||||
@ -565,7 +557,7 @@ class PCRE_Options {
|
||||
template <class T>
|
||||
class _PCRE_MatchObject {
|
||||
public:
|
||||
static inline bool Parse(const char* str, int n, void* dest) {
|
||||
static inline bool Parse(const char* str, size_t n, void* dest) {
|
||||
if (dest == NULL) return true;
|
||||
T* object = reinterpret_cast<T*>(dest);
|
||||
return object->ParseFrom(str, n);
|
||||
@ -580,16 +572,21 @@ class PCRE::Arg {
|
||||
// Constructor specially designed for NULL arguments
|
||||
Arg(void*);
|
||||
|
||||
typedef bool (*Parser)(const char* str, int n, void* dest);
|
||||
typedef bool (*Parser)(const char* str, size_t n, void* dest);
|
||||
|
||||
// Type-specific parsers
|
||||
#define MAKE_PARSER(type,name) \
|
||||
Arg(type* p) : arg_(p), parser_(name) { } \
|
||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
|
||||
|
||||
#define MAKE_PARSER(type, name) \
|
||||
Arg(type* p) : arg_(p), parser_(name) {} \
|
||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
|
||||
|
||||
MAKE_PARSER(char, parse_char);
|
||||
MAKE_PARSER(signed char, parse_schar);
|
||||
MAKE_PARSER(unsigned char, parse_uchar);
|
||||
MAKE_PARSER(float, parse_float);
|
||||
MAKE_PARSER(double, parse_double);
|
||||
MAKE_PARSER(string, parse_string);
|
||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||
|
||||
MAKE_PARSER(short, parse_short);
|
||||
MAKE_PARSER(unsigned short, parse_ushort);
|
||||
MAKE_PARSER(int, parse_int);
|
||||
@ -598,10 +595,6 @@ class PCRE::Arg {
|
||||
MAKE_PARSER(unsigned long, parse_ulong);
|
||||
MAKE_PARSER(long long, parse_longlong);
|
||||
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
||||
MAKE_PARSER(float, parse_float);
|
||||
MAKE_PARSER(double, parse_double);
|
||||
MAKE_PARSER(string, parse_string);
|
||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||
|
||||
#undef MAKE_PARSER
|
||||
|
||||
@ -613,29 +606,31 @@ class PCRE::Arg {
|
||||
}
|
||||
|
||||
// Parse the data
|
||||
bool Parse(const char* str, int n) const;
|
||||
bool Parse(const char* str, size_t n) const;
|
||||
|
||||
private:
|
||||
void* arg_;
|
||||
Parser parser_;
|
||||
|
||||
static bool parse_null (const char* str, int n, void* dest);
|
||||
static bool parse_char (const char* str, int n, void* dest);
|
||||
static bool parse_uchar (const char* str, int n, void* dest);
|
||||
static bool parse_float (const char* str, int n, void* dest);
|
||||
static bool parse_double (const char* str, int n, void* dest);
|
||||
static bool parse_string (const char* str, int n, void* dest);
|
||||
static bool parse_stringpiece (const char* str, int n, void* dest);
|
||||
static bool parse_null (const char* str, size_t n, void* dest);
|
||||
static bool parse_char (const char* str, size_t n, void* dest);
|
||||
static bool parse_schar (const char* str, size_t n, void* dest);
|
||||
static bool parse_uchar (const char* str, size_t n, void* dest);
|
||||
static bool parse_float (const char* str, size_t n, void* dest);
|
||||
static bool parse_double (const char* str, size_t n, void* dest);
|
||||
static bool parse_string (const char* str, size_t n, void* dest);
|
||||
static bool parse_stringpiece (const char* str, size_t n, void* dest);
|
||||
|
||||
#define DECLARE_INTEGER_PARSER(name) \
|
||||
private: \
|
||||
static bool parse_ ## name(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _radix( \
|
||||
const char* str, int n, void* dest, int radix); \
|
||||
public: \
|
||||
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
|
||||
#define DECLARE_INTEGER_PARSER(name) \
|
||||
private: \
|
||||
static bool parse_##name(const char* str, size_t n, void* dest); \
|
||||
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
|
||||
int radix); \
|
||||
\
|
||||
public: \
|
||||
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
|
||||
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
|
||||
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
|
||||
|
||||
DECLARE_INTEGER_PARSER(short);
|
||||
DECLARE_INTEGER_PARSER(ushort);
|
||||
@ -647,23 +642,27 @@ class PCRE::Arg {
|
||||
DECLARE_INTEGER_PARSER(ulonglong);
|
||||
|
||||
#undef DECLARE_INTEGER_PARSER
|
||||
|
||||
};
|
||||
|
||||
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
||||
inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
||||
|
||||
inline bool PCRE::Arg::Parse(const char* str, int n) const {
|
||||
inline bool PCRE::Arg::Parse(const char* str, size_t n) const {
|
||||
return (*parser_)(str, n, arg_);
|
||||
}
|
||||
|
||||
// This part of the parser, appropriate only for ints, deals with bases
|
||||
#define MAKE_INTEGER_PARSER(type, name) \
|
||||
inline PCRE::Arg Hex(type* ptr) { \
|
||||
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \
|
||||
inline PCRE::Arg Octal(type* ptr) { \
|
||||
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \
|
||||
inline PCRE::Arg CRadix(type* ptr) { \
|
||||
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); }
|
||||
#define MAKE_INTEGER_PARSER(type, name) \
|
||||
inline PCRE::Arg Hex(type* ptr) { \
|
||||
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
|
||||
} \
|
||||
inline PCRE::Arg Octal(type* ptr) { \
|
||||
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
|
||||
} \
|
||||
inline PCRE::Arg CRadix(type* ptr) { \
|
||||
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
|
||||
}
|
||||
|
||||
MAKE_INTEGER_PARSER(short, short);
|
||||
MAKE_INTEGER_PARSER(unsigned short, ushort);
|
||||
@ -677,3 +676,5 @@ MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
|
||||
#undef MAKE_INTEGER_PARSER
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // UTIL_PCRE_H_
|
||||
|
@ -1,34 +0,0 @@
|
||||
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Modified from Google perftools's tcmalloc_unittest.cc.
|
||||
|
||||
#include "util/random.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Advances the generator one step and returns the new state.
// In effect this computes seed_ = (seed_ * kMultiplier) % kModulus with
// kModulus = 2^31-1, working on the two halves of seed_ separately so the
// intermediate products fit in 32 bits.
int32 ACMRandom::Next() {
  const int32 kModulus = 2147483647L;  // 2^31-1
  const int32 kMultiplier = 16807;     // 7^5

  // Partial products of the low 16 bits and of the remaining high bits.
  uint32 low = kMultiplier * (int32)(seed_ & 0xFFFF);
  const uint32 high = kMultiplier * (int32)((uint32)seed_ >> 16);

  // Fold in the low 15 bits of the high product; on overflow past the
  // modulus, mask back to 31 bits and add one.
  low += (high & 0x7FFF) << 16;
  if (low > kModulus)
    low = (low & kModulus) + 1;

  // Fold in the remaining bits of the high product, reducing the same way.
  low += high >> 15;
  if (low > kModulus)
    low = (low & kModulus) + 1;

  seed_ = (int32)low;
  return seed_;
}
|
||||
|
||||
// Returns Next() reduced modulo n.
// A zero n is rejected up front: integer modulo by zero is undefined
// behaviour, so return 0 (without advancing the generator) instead of
// executing the division.
int32 ACMRandom::Uniform(int32 n) {
  if (n == 0)
    return 0;
  return Next() % n;
}
|
||||
|
||||
} // namespace re2
|
@ -1,29 +0,0 @@
|
||||
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Modified from Google perftools's tcmalloc_unittest.cc.
|
||||
|
||||
#ifndef RE2_UTIL_RANDOM_H__
|
||||
#define RE2_UTIL_RANDOM_H__
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// ACM minimal standard random number generator. (re-entrant.)
|
||||
class ACMRandom {
|
||||
public:
|
||||
ACMRandom(int32 seed) : seed_(seed) {}
|
||||
int32 Next();
|
||||
int32 Uniform(int32);
|
||||
|
||||
void Reset(int32 seed) { seed_ = seed; }
|
||||
|
||||
private:
|
||||
int32 seed_;
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UTIL_RANDOM_H__
|
@ -11,8 +11,10 @@
|
||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||
*/
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "util/utf.h"
|
||||
|
||||
namespace re2 {
|
||||
@ -133,7 +135,7 @@ runetochar(char *str, const Rune *rune)
|
||||
*/
|
||||
c = *rune;
|
||||
if(c <= Rune1) {
|
||||
str[0] = c;
|
||||
str[0] = static_cast<char>(c);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -142,7 +144,7 @@ runetochar(char *str, const Rune *rune)
|
||||
* 0080-07FF => T2 Tx
|
||||
*/
|
||||
if(c <= Rune2) {
|
||||
str[0] = T2 | (c >> 1*Bitx);
|
||||
str[0] = T2 | static_cast<char>(c >> 1*Bitx);
|
||||
str[1] = Tx | (c & Maskx);
|
||||
return 2;
|
||||
}
|
||||
@ -161,9 +163,9 @@ runetochar(char *str, const Rune *rune)
|
||||
* 0800-FFFF => T3 Tx Tx
|
||||
*/
|
||||
if (c <= Rune3) {
|
||||
str[0] = T3 | (c >> 2*Bitx);
|
||||
str[0] = T3 | static_cast<char>(c >> 2*Bitx);
|
||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[2] = Tx | (c & Maskx);
|
||||
str[2] = Tx | (c & Maskx);
|
||||
return 3;
|
||||
}
|
||||
|
||||
@ -171,7 +173,7 @@ runetochar(char *str, const Rune *rune)
|
||||
* four character sequence (21-bit value)
|
||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||
*/
|
||||
str[0] = T4 | (c >> 3*Bitx);
|
||||
str[0] = T4 | static_cast<char>(c >> 3*Bitx);
|
||||
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||
str[3] = Tx | (c & Maskx);
|
||||
|
@ -2,97 +2,111 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_SPARSE_ARRAY_H_
|
||||
#define UTIL_SPARSE_ARRAY_H_
|
||||
|
||||
// DESCRIPTION
|
||||
//
|
||||
//
|
||||
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
||||
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
|
||||
// fast iteration through the elements in the array and fast clearing
|
||||
// of the array. The array has a concept of certain elements being
|
||||
// uninitialized (having no value).
|
||||
//
|
||||
//
|
||||
// Insertion and deletion are constant time operations.
|
||||
//
|
||||
// Allocating the array is a constant time operation
|
||||
//
|
||||
// Allocating the array is a constant time operation
|
||||
// when memory allocation is a constant time operation.
|
||||
//
|
||||
//
|
||||
// Clearing the array is a constant time operation (unusual!).
|
||||
//
|
||||
//
|
||||
// Iterating through the array is an O(n) operation, where n
|
||||
// is the number of items in the array (not O(m)).
|
||||
//
|
||||
// The array iterator visits entries in the order they were first
|
||||
// The array iterator visits entries in the order they were first
|
||||
// inserted into the array. It is safe to add items to the array while
|
||||
// using an iterator: the iterator will visit indices added to the array
|
||||
// during the iteration, but will not re-visit indices whose values
|
||||
// change after visiting. Thus SparseArray can be a convenient
|
||||
// implementation of a work queue.
|
||||
//
|
||||
//
|
||||
// The SparseArray implementation is NOT thread-safe. It is up to the
|
||||
// caller to make sure only one thread is accessing the array. (Typically
|
||||
// these arrays are temporary values and used in situations where speed is
|
||||
// important.)
|
||||
//
|
||||
//
|
||||
// The SparseArray interface does not present all the usual STL bells and
|
||||
// whistles.
|
||||
//
|
||||
//
|
||||
// Implemented with reference to Briggs & Torczon, An Efficient
|
||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
||||
//
|
||||
//
|
||||
// Briggs & Torczon popularized this technique, but it had been known
|
||||
// long before their paper. They point out that Aho, Hopcroft, and
|
||||
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
|
||||
// 1986 Programming Pearls both hint at the technique in exercises to the
|
||||
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
|
||||
// exercise 8).
|
||||
//
|
||||
//
|
||||
// Briggs & Torczon describe a sparse set implementation. I have
|
||||
// trivially generalized it to create a sparse array (actually the original
|
||||
// target of the AHU and Bentley exercises).
|
||||
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
|
||||
// size max_size_. At any point, the number of elements in the sparse array is
|
||||
// size_.
|
||||
//
|
||||
// The vector dense_ contains the size_ elements in the sparse array (with
|
||||
// SparseArray is an array dense_ and an array sparse_, both of size max_size_.
|
||||
// At any point, the number of elements in the sparse array is size_.
|
||||
//
|
||||
// The array dense_ contains the size_ elements in the sparse array (with
|
||||
// their indices),
|
||||
// in the order that the elements were first inserted. This array is dense:
|
||||
// the size_ pairs are dense_[0] through dense_[size_-1].
|
||||
//
|
||||
// The array sparse_to_dense_ maps from indices in [0,m) to indices in
|
||||
// [0,size_).
|
||||
// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
|
||||
// For indices not present in the array, sparse_to_dense_ can contain
|
||||
// any value at all, perhaps outside the range [0, size_) but perhaps not.
|
||||
//
|
||||
// The lax requirement on sparse_to_dense_ values makes clearing
|
||||
// the array very easy: set size_ to 0. Lookups are slightly more
|
||||
// complicated. An index i has a value in the array if and only if:
|
||||
// sparse_to_dense_[i] is in [0, size_) AND
|
||||
// dense_[sparse_to_dense_[i]].index_ == i.
|
||||
// If both these properties hold, only then it is safe to refer to
|
||||
// dense_[sparse_to_dense_[i]].value_
|
||||
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
|
||||
// For indices present in the array, dense_[sparse_[i]].index_ == i.
|
||||
// For indices not present in the array, sparse_ can contain any value at all,
|
||||
// perhaps outside the range [0, size_) but perhaps not.
|
||||
//
|
||||
// The lax requirement on sparse_ values makes clearing the array very easy:
|
||||
// set size_ to 0. Lookups are slightly more complicated.
|
||||
// An index i has a value in the array if and only if:
|
||||
// sparse_[i] is in [0, size_) AND
|
||||
// dense_[sparse_[i]].index_ == i.
|
||||
// If both these properties hold, only then it is safe to refer to
|
||||
// dense_[sparse_[i]].value_
|
||||
// as the value associated with index i.
|
||||
//
|
||||
// To insert a new entry, set sparse_to_dense_[i] to size_,
|
||||
// To insert a new entry, set sparse_[i] to size_,
|
||||
// initialize dense_[size_], and then increment size_.
|
||||
//
|
||||
// Deletion of specific values from the array is implemented by
|
||||
// swapping dense_[size_-1] and the dense_ being deleted and then
|
||||
// updating the appropriate sparse_to_dense_ entries.
|
||||
//
|
||||
// updating the appropriate sparse_ entries.
|
||||
//
|
||||
// To make the sparse array as efficient as possible for non-primitive types,
|
||||
// elements may or may not be destroyed when they are deleted from the sparse
|
||||
// array through a call to erase(), erase_existing() or resize(). They
|
||||
// immediately become inaccessible, but they are only guaranteed to be
|
||||
// destroyed when the SparseArray destructor is called.
|
||||
//
|
||||
// A moved-from SparseArray will be empty.
|
||||
|
||||
#ifndef RE2_UTIL_SPARSE_ARRAY_H__
|
||||
#define RE2_UTIL_SPARSE_ARRAY_H__
|
||||
// Doing this simplifies the logic below.
|
||||
#ifndef __has_feature
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
|
||||
#include "util/util.h"
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#endif
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
@ -100,36 +114,49 @@ template<typename Value>
|
||||
class SparseArray {
|
||||
public:
|
||||
SparseArray();
|
||||
SparseArray(int max_size);
|
||||
explicit SparseArray(int max_size);
|
||||
~SparseArray();
|
||||
|
||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
||||
class IndexValue;
|
||||
static_assert(std::is_trivially_destructible<IndexValue>::value,
|
||||
"IndexValue must be trivially destructible");
|
||||
|
||||
typedef IndexValue value_type;
|
||||
typedef typename vector<IndexValue>::iterator iterator;
|
||||
typedef typename vector<IndexValue>::const_iterator const_iterator;
|
||||
typedef IndexValue* iterator;
|
||||
typedef const IndexValue* const_iterator;
|
||||
|
||||
inline const IndexValue& iv(int i) const;
|
||||
SparseArray(const SparseArray& src);
|
||||
SparseArray(SparseArray&& src) /*noexcept*/;
|
||||
|
||||
SparseArray& operator=(const SparseArray& src);
|
||||
SparseArray& operator=(SparseArray&& src) /*noexcept*/;
|
||||
|
||||
const IndexValue& iv(int i) const;
|
||||
|
||||
// Return the number of entries in the array.
|
||||
int size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
// Indicate whether the array is empty.
|
||||
int empty() const {
|
||||
return size_ == 0;
|
||||
}
|
||||
|
||||
// Iterate over the array.
|
||||
iterator begin() {
|
||||
return dense_.begin();
|
||||
return dense_.get();
|
||||
}
|
||||
iterator end() {
|
||||
return dense_.begin() + size_;
|
||||
return dense_.get() + size_;
|
||||
}
|
||||
|
||||
const_iterator begin() const {
|
||||
return dense_.begin();
|
||||
return dense_.get();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return dense_.begin() + size_;
|
||||
return dense_.get() + size_;
|
||||
}
|
||||
|
||||
// Change the maximum size of the array.
|
||||
@ -148,39 +175,68 @@ class SparseArray {
|
||||
}
|
||||
|
||||
// Check whether index i is in the array.
|
||||
inline bool has_index(int i) const;
|
||||
bool has_index(int i) const;
|
||||
|
||||
// Comparison function for sorting.
|
||||
// Can sort the sparse array so that future iterations
|
||||
// will visit indices in increasing order using
|
||||
// sort(arr.begin(), arr.end(), arr.less);
|
||||
// std::sort(arr.begin(), arr.end(), arr.less);
|
||||
static bool less(const IndexValue& a, const IndexValue& b);
|
||||
|
||||
public:
|
||||
// Set the value at index i to v.
|
||||
inline iterator set(int i, Value v);
|
||||
iterator set(int i, const Value& v) {
|
||||
return SetInternal(true, i, v);
|
||||
}
|
||||
iterator set(int i, Value&& v) { // NOLINT
|
||||
return SetInternal(true, i, std::move(v));
|
||||
}
|
||||
|
||||
pair<iterator, bool> insert(const value_type& new_value);
|
||||
std::pair<iterator, bool> insert(const value_type& v) {
|
||||
return InsertInternal(v);
|
||||
}
|
||||
std::pair<iterator, bool> insert(value_type&& v) { // NOLINT
|
||||
return InsertInternal(std::move(v));
|
||||
}
|
||||
|
||||
// Returns the value at index i
|
||||
// or defaultv if index i is not initialized in the array.
|
||||
inline Value get(int i, Value defaultv) const;
|
||||
template <typename... Args>
|
||||
std::pair<iterator, bool> emplace(Args&&... args) { // NOLINT
|
||||
return InsertInternal(value_type(std::forward<Args>(args)...));
|
||||
}
|
||||
|
||||
iterator find(int i);
|
||||
iterator find(int i) {
|
||||
if (has_index(i))
|
||||
return dense_.get() + sparse_[i];
|
||||
return end();
|
||||
}
|
||||
|
||||
const_iterator find(int i) const;
|
||||
const_iterator find(int i) const {
|
||||
if (has_index(i))
|
||||
return dense_.get() + sparse_[i];
|
||||
return end();
|
||||
}
|
||||
|
||||
// Change the value at index i to v.
|
||||
// Fast but unsafe: only use if has_index(i) is true.
|
||||
inline iterator set_existing(int i, Value v);
|
||||
iterator set_existing(int i, const Value& v) {
|
||||
return SetExistingInternal(i, v);
|
||||
}
|
||||
iterator set_existing(int i, Value&& v) { // NOLINT
|
||||
return SetExistingInternal(i, std::move(v));
|
||||
}
|
||||
|
||||
// Set the value at the new index i to v.
|
||||
// Fast but unsafe: only use if has_index(i) is false.
|
||||
inline iterator set_new(int i, Value v);
|
||||
iterator set_new(int i, const Value& v) {
|
||||
return SetInternal(false, i, v);
|
||||
}
|
||||
iterator set_new(int i, Value&& v) { // NOLINT
|
||||
return SetInternal(false, i, std::move(v));
|
||||
}
|
||||
|
||||
// Get the value at index i from the array.
|
||||
// Fast but unsafe: only use if has_index(i) is true.
|
||||
inline Value get_existing(int i) const;
|
||||
const Value& get_existing(int i) const;
|
||||
|
||||
// Erasing items from the array during iteration is in general
|
||||
// NOT safe. There is one special case, which is that the current
|
||||
@ -201,37 +257,132 @@ class SparseArray {
|
||||
// the iterators could walk past the end of the array.
|
||||
|
||||
// Erases the element at index i from the array.
|
||||
inline void erase(int i);
|
||||
void erase(int i);
|
||||
|
||||
// Erases the element at index i from the array.
|
||||
// Fast but unsafe: only use if has_index(i) is true.
|
||||
inline void erase_existing(int i);
|
||||
void erase_existing(int i);
|
||||
|
||||
private:
|
||||
template <typename U>
|
||||
std::pair<iterator, bool> InsertInternal(U&& v) {
|
||||
DebugCheckInvariants();
|
||||
std::pair<iterator, bool> p;
|
||||
if (has_index(v.index_)) {
|
||||
p = {dense_.get() + sparse_[v.index_], false};
|
||||
} else {
|
||||
p = {set_new(std::forward<U>(v).index_, std::forward<U>(v).second), true};
|
||||
}
|
||||
DebugCheckInvariants();
|
||||
return p;
|
||||
}
|
||||
|
||||
template <typename U>
|
||||
iterator SetInternal(bool allow_overwrite, int i, U&& v) { // NOLINT
|
||||
DebugCheckInvariants();
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||
assert(false && "illegal index");
|
||||
// Semantically, end() would be better here, but we already know
|
||||
// the user did something stupid, so begin() insulates them from
|
||||
// dereferencing an invalid pointer.
|
||||
return begin();
|
||||
}
|
||||
if (!allow_overwrite) {
|
||||
assert(!has_index(i));
|
||||
create_index(i);
|
||||
} else {
|
||||
if (!has_index(i))
|
||||
create_index(i);
|
||||
}
|
||||
return set_existing(i, std::forward<U>(v)); // NOLINT
|
||||
}
|
||||
|
||||
template <typename U>
|
||||
iterator SetExistingInternal(int i, U&& v) { // NOLINT
|
||||
DebugCheckInvariants();
|
||||
assert(has_index(i));
|
||||
dense_[sparse_[i]].value() = std::forward<U>(v);
|
||||
DebugCheckInvariants();
|
||||
return dense_.get() + sparse_[i];
|
||||
}
|
||||
|
||||
// Add the index i to the array.
|
||||
// Only use if has_index(i) is known to be false.
|
||||
// Since it doesn't set the value associated with i,
|
||||
// this function is private, only intended as a helper
|
||||
// for other methods.
|
||||
inline void create_index(int i);
|
||||
void create_index(int i);
|
||||
|
||||
// In debug mode, verify that some invariant properties of the class
|
||||
// are being maintained. This is called at the end of the constructor
|
||||
// and at the beginning and end of all public non-const member functions.
|
||||
inline void DebugCheckInvariants() const;
|
||||
void DebugCheckInvariants() const;
|
||||
|
||||
int size_;
|
||||
int max_size_;
|
||||
int* sparse_to_dense_;
|
||||
vector<IndexValue> dense_;
|
||||
bool valgrind_;
|
||||
// Initializes memory for elements [min, max) of sparse_.
// What happens depends on the build:
//  - with the memory_sanitizer feature, the range is handed to
//    __msan_unpoison;
//  - otherwise, when RE2_ON_VALGRIND is defined, every slot in the range is
//    overwritten with the filler value 0xabababab;
//  - in any other build the body is empty and the range is left as it is.
|
||||
void MaybeInitializeMemory(int min, int max) {
|
||||
#if __has_feature(memory_sanitizer)
|
||||
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
|
||||
#elif defined(RE2_ON_VALGRIND)
|
||||
for (int i = min; i < max; i++) {
|
||||
sparse_[i] = 0xababababU;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
|
||||
int size_ = 0;
|
||||
int max_size_ = 0;
|
||||
std::unique_ptr<int[]> sparse_;
|
||||
std::unique_ptr<IndexValue[]> dense_;
|
||||
};
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>::SparseArray()
|
||||
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {}
|
||||
SparseArray<Value>::SparseArray() = default;
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>::SparseArray(const SparseArray& src)
|
||||
: size_(src.size_),
|
||||
max_size_(src.max_size_),
|
||||
sparse_(new int[max_size_]),
|
||||
dense_(new IndexValue[max_size_]) {
|
||||
std::copy_n(src.sparse_.get(), max_size_, sparse_.get());
|
||||
std::copy_n(src.dense_.get(), max_size_, dense_.get());
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT
|
||||
: size_(src.size_),
|
||||
max_size_(src.max_size_),
|
||||
sparse_(std::move(src.sparse_)),
|
||||
dense_(std::move(src.dense_)) {
|
||||
src.size_ = 0;
|
||||
src.max_size_ = 0;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
|
||||
size_ = src.size_;
|
||||
max_size_ = src.max_size_;
|
||||
std::unique_ptr<int[]> a(new int[max_size_]);
|
||||
std::copy_n(src.sparse_.get(), src.max_size_, a.get());
|
||||
sparse_ = std::move(a);
|
||||
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size_]);
|
||||
std::copy_n(src.dense_.get(), src.max_size_, b.get());
|
||||
dense_ = std::move(b);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
SparseArray<Value>& SparseArray<Value>::operator=(
|
||||
SparseArray&& src) /*noexcept*/ { // NOLINT
|
||||
size_ = src.size_;
|
||||
max_size_ = src.max_size_;
|
||||
sparse_ = std::move(src.sparse_);
|
||||
dense_ = std::move(src.dense_);
|
||||
// clear out the source
|
||||
src.size_ = 0;
|
||||
src.max_size_ = 0;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
||||
template<typename Value>
|
||||
@ -242,48 +393,55 @@ class SparseArray<Value>::IndexValue {
|
||||
typedef Value second_type;
|
||||
|
||||
IndexValue() {}
|
||||
IndexValue(int index, const Value& value) : second(value), index_(index) {}
|
||||
IndexValue(int i, const Value& v) : index_(i), second(v) {}
|
||||
IndexValue(int i, Value&& v) : index_(i), second(std::move(v)) {}
|
||||
|
||||
int index() const { return index_; }
|
||||
Value value() const { return second; }
|
||||
|
||||
// Provide the data in the 'second' member so that the utilities
|
||||
// in map-util work.
|
||||
Value second;
|
||||
Value& value() /*&*/ { return second; }
|
||||
const Value& value() const /*&*/ { return second; }
|
||||
//Value&& value() /*&&*/ { return std::move(second); } // NOLINT
|
||||
|
||||
private:
|
||||
int index_;
|
||||
|
||||
public:
|
||||
// Provide the data in the 'second' member so that the utilities
|
||||
// in map-util work.
|
||||
// TODO(billydonahue): 'second' is public for short-term compatibility.
|
||||
// Users will be transitioned to using value() accessor.
|
||||
Value second;
|
||||
};
|
||||
|
||||
template<typename Value>
|
||||
const typename SparseArray<Value>::IndexValue&
|
||||
SparseArray<Value>::iv(int i) const {
|
||||
DCHECK_GE(i, 0);
|
||||
DCHECK_LT(i, size_);
|
||||
assert(i >= 0);
|
||||
assert(i < size_);
|
||||
return dense_[i];
|
||||
}
|
||||
|
||||
// Change the maximum size of the array.
|
||||
// Invalidates all iterators.
|
||||
template<typename Value>
|
||||
void SparseArray<Value>::resize(int new_max_size) {
|
||||
void SparseArray<Value>::resize(int max_size) {
|
||||
DebugCheckInvariants();
|
||||
if (new_max_size > max_size_) {
|
||||
int* a = new int[new_max_size];
|
||||
if (sparse_to_dense_) {
|
||||
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
||||
// Don't need to zero the memory but appease Valgrind.
|
||||
if (valgrind_) {
|
||||
for (int i = max_size_; i < new_max_size; i++)
|
||||
a[i] = 0xababababU;
|
||||
}
|
||||
delete[] sparse_to_dense_;
|
||||
if (max_size > max_size_) {
|
||||
std::unique_ptr<int[]> a(new int[max_size]);
|
||||
if (sparse_) {
|
||||
std::copy_n(sparse_.get(), max_size_, a.get());
|
||||
}
|
||||
sparse_to_dense_ = a;
|
||||
sparse_ = std::move(a);
|
||||
|
||||
dense_.resize(new_max_size);
|
||||
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]);
|
||||
if (dense_) {
|
||||
std::copy_n(dense_.get(), max_size_, b.get());
|
||||
}
|
||||
dense_ = std::move(b);
|
||||
|
||||
MaybeInitializeMemory(max_size_, max_size);
|
||||
}
|
||||
max_size_ = new_max_size;
|
||||
max_size_ = max_size;
|
||||
if (size_ > max_size_)
|
||||
size_ = max_size_;
|
||||
DebugCheckInvariants();
|
||||
@ -292,97 +450,20 @@ void SparseArray<Value>::resize(int new_max_size) {
|
||||
// Check whether index i is in the array.
|
||||
template<typename Value>
|
||||
bool SparseArray<Value>::has_index(int i) const {
|
||||
DCHECK_GE(i, 0);
|
||||
DCHECK_LT(i, max_size_);
|
||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
||||
assert(i >= 0);
|
||||
assert(i < max_size_);
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||
return false;
|
||||
}
|
||||
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
||||
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
||||
dense_[sparse_to_dense_[i]].index_ == i;
|
||||
}
|
||||
|
||||
// Set the value at index i to v.
|
||||
template<typename Value>
|
||||
typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
|
||||
DebugCheckInvariants();
|
||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
||||
// Semantically, end() would be better here, but we already know
|
||||
// the user did something stupid, so begin() insulates them from
|
||||
// dereferencing an invalid pointer.
|
||||
return begin();
|
||||
}
|
||||
if (!has_index(i))
|
||||
create_index(i);
|
||||
return set_existing(i, v);
|
||||
// Unsigned comparison avoids checking sparse_[i] < 0.
|
||||
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
||||
dense_[sparse_[i]].index_ == i;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
|
||||
const value_type& new_value) {
|
||||
DebugCheckInvariants();
|
||||
pair<typename SparseArray<Value>::iterator, bool> p;
|
||||
if (has_index(new_value.index_)) {
|
||||
p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
|
||||
} else {
|
||||
p = make_pair(set_new(new_value.index_, new_value.second), true);
|
||||
}
|
||||
DebugCheckInvariants();
|
||||
return p;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
Value SparseArray<Value>::get(int i, Value defaultv) const {
|
||||
if (!has_index(i))
|
||||
return defaultv;
|
||||
return get_existing(i);
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
|
||||
if (has_index(i))
|
||||
return dense_.begin() + sparse_to_dense_[i];
|
||||
return end();
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
typename SparseArray<Value>::const_iterator
|
||||
SparseArray<Value>::find(int i) const {
|
||||
if (has_index(i)) {
|
||||
return dense_.begin() + sparse_to_dense_[i];
|
||||
}
|
||||
return end();
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
typename SparseArray<Value>::iterator
|
||||
SparseArray<Value>::set_existing(int i, Value v) {
|
||||
DebugCheckInvariants();
|
||||
DCHECK(has_index(i));
|
||||
dense_[sparse_to_dense_[i]].second = v;
|
||||
DebugCheckInvariants();
|
||||
return dense_.begin() + sparse_to_dense_[i];
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
typename SparseArray<Value>::iterator
|
||||
SparseArray<Value>::set_new(int i, Value v) {
|
||||
DebugCheckInvariants();
|
||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
||||
// Semantically, end() would be better here, but we already know
|
||||
// the user did something stupid, so begin() insulates them from
|
||||
// dereferencing an invalid pointer.
|
||||
return begin();
|
||||
}
|
||||
DCHECK(!has_index(i));
|
||||
create_index(i);
|
||||
return set_existing(i, v);
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
Value SparseArray<Value>::get_existing(int i) const {
|
||||
DCHECK(has_index(i));
|
||||
return dense_[sparse_to_dense_[i]].second;
|
||||
const Value& SparseArray<Value>::get_existing(int i) const {
|
||||
assert(has_index(i));
|
||||
return dense_[sparse_[i]].second;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
@ -396,11 +477,11 @@ void SparseArray<Value>::erase(int i) {
|
||||
template<typename Value>
|
||||
void SparseArray<Value>::erase_existing(int i) {
|
||||
DebugCheckInvariants();
|
||||
DCHECK(has_index(i));
|
||||
int di = sparse_to_dense_[i];
|
||||
assert(has_index(i));
|
||||
int di = sparse_[i];
|
||||
if (di < size_ - 1) {
|
||||
dense_[di] = dense_[size_ - 1];
|
||||
sparse_to_dense_[dense_[di].index_] = di;
|
||||
dense_[di] = std::move(dense_[size_ - 1]);
|
||||
sparse_[dense_[di].index_] = di;
|
||||
}
|
||||
size_--;
|
||||
DebugCheckInvariants();
|
||||
@ -408,38 +489,30 @@ void SparseArray<Value>::erase_existing(int i) {
|
||||
|
||||
template<typename Value>
|
||||
void SparseArray<Value>::create_index(int i) {
|
||||
DCHECK(!has_index(i));
|
||||
DCHECK_LT(size_, max_size_);
|
||||
sparse_to_dense_[i] = size_;
|
||||
assert(!has_index(i));
|
||||
assert(size_ < max_size_);
|
||||
sparse_[i] = size_;
|
||||
dense_[size_].index_ = i;
|
||||
size_++;
|
||||
}
|
||||
|
||||
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
|
||||
max_size_ = max_size;
|
||||
sparse_to_dense_ = new int[max_size];
|
||||
valgrind_ = RunningOnValgrind();
|
||||
dense_.resize(max_size);
|
||||
// Don't need to zero the new memory, but appease Valgrind.
|
||||
if (valgrind_) {
|
||||
for (int i = 0; i < max_size; i++) {
|
||||
sparse_to_dense_[i] = 0xababababU;
|
||||
dense_[i].index_ = 0xababababU;
|
||||
}
|
||||
}
|
||||
sparse_.reset(new int[max_size]);
|
||||
dense_.reset(new IndexValue[max_size]);
|
||||
size_ = 0;
|
||||
MaybeInitializeMemory(size_, max_size);
|
||||
max_size_ = max_size;
|
||||
DebugCheckInvariants();
|
||||
}
|
||||
|
||||
template<typename Value> SparseArray<Value>::~SparseArray() {
|
||||
DebugCheckInvariants();
|
||||
delete[] sparse_to_dense_;
|
||||
}
|
||||
|
||||
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
|
||||
DCHECK_LE(0, size_);
|
||||
DCHECK_LE(size_, max_size_);
|
||||
DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
|
||||
assert(0 <= size_);
|
||||
assert(size_ <= max_size_);
|
||||
assert(size_ == 0 || sparse_ != NULL);
|
||||
}
|
||||
|
||||
// Comparison function for sorting.
|
||||
@ -450,4 +523,4 @@ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UTIL_SPARSE_ARRAY_H__
|
||||
#endif // UTIL_SPARSE_ARRAY_H_
|
||||
|
@ -2,178 +2,265 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_SPARSE_SET_H_
|
||||
#define UTIL_SPARSE_SET_H_
|
||||
|
||||
// DESCRIPTION
|
||||
//
|
||||
// SparseSet<T>(m) is a set of integers in [0, m).
|
||||
//
|
||||
// SparseSet(m) is a set of integers in [0, m).
|
||||
// It requires sizeof(int)*m memory, but it provides
|
||||
// fast iteration through the elements in the set and fast clearing
|
||||
// of the set.
|
||||
//
|
||||
//
|
||||
// Insertion and deletion are constant time operations.
|
||||
//
|
||||
// Allocating the set is a constant time operation
|
||||
//
|
||||
// Allocating the set is a constant time operation
|
||||
// when memory allocation is a constant time operation.
|
||||
//
|
||||
//
|
||||
// Clearing the set is a constant time operation (unusual!).
|
||||
//
|
||||
//
|
||||
// Iterating through the set is an O(n) operation, where n
|
||||
// is the number of items in the set (not O(m)).
|
||||
//
|
||||
// The set iterator visits entries in the order they were first
|
||||
// inserted into the array. It is safe to add items to the set while
|
||||
// The set iterator visits entries in the order they were first
|
||||
// inserted into the set. It is safe to add items to the set while
|
||||
// using an iterator: the iterator will visit indices added to the set
|
||||
// during the iteration, but will not re-visit indices whose values
|
||||
// change after visiting. Thus SparseSet can be a convenient
|
||||
// implementation of a work queue.
|
||||
//
|
||||
//
|
||||
// The SparseSet implementation is NOT thread-safe. It is up to the
|
||||
// caller to make sure only one thread is accessing the set. (Typically
|
||||
// these sets are temporary values and used in situations where speed is
|
||||
// important.)
|
||||
//
|
||||
//
|
||||
// The SparseSet interface does not present all the usual STL bells and
|
||||
// whistles.
|
||||
//
|
||||
//
|
||||
// Implemented with reference to Briggs & Torczon, An Efficient
|
||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
||||
//
|
||||
// For a generalization to sparse array, see sparse_array.h.
|
||||
//
|
||||
// This is a specialization of sparse array; see sparse_array.h.
|
||||
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// See sparse_array.h for implementation details
|
||||
// See sparse_array.h for implementation details.
|
||||
|
||||
#ifndef RE2_UTIL_SPARSE_SET_H__
|
||||
#define RE2_UTIL_SPARSE_SET_H__
|
||||
// Doing this simplifies the logic below.
|
||||
#ifndef __has_feature
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
|
||||
#include "util/util.h"
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#include <sanitizer/msan_interface.h>
|
||||
#endif
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class SparseSet {
|
||||
template<typename Value>
|
||||
class SparseSetT {
|
||||
public:
|
||||
SparseSet()
|
||||
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {}
|
||||
|
||||
SparseSet(int max_size) {
|
||||
max_size_ = max_size;
|
||||
sparse_to_dense_ = new int[max_size];
|
||||
dense_ = new int[max_size];
|
||||
valgrind_ = RunningOnValgrind();
|
||||
// Don't need to zero the memory, but do so anyway
|
||||
// to appease Valgrind.
|
||||
if (valgrind_) {
|
||||
for (int i = 0; i < max_size; i++) {
|
||||
dense_[i] = 0xababababU;
|
||||
sparse_to_dense_[i] = 0xababababU;
|
||||
}
|
||||
}
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
~SparseSet() {
|
||||
delete[] sparse_to_dense_;
|
||||
delete[] dense_;
|
||||
}
|
||||
SparseSetT();
|
||||
explicit SparseSetT(int max_size);
|
||||
~SparseSetT();
|
||||
|
||||
typedef int* iterator;
|
||||
typedef const int* const_iterator;
|
||||
|
||||
int size() const { return size_; }
|
||||
iterator begin() { return dense_; }
|
||||
iterator end() { return dense_ + size_; }
|
||||
const_iterator begin() const { return dense_; }
|
||||
const_iterator end() const { return dense_ + size_; }
|
||||
// Return the number of entries in the set.
|
||||
int size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
// Change the maximum size of the array.
|
||||
// Indicate whether the set is empty.
|
||||
int empty() const {
|
||||
return size_ == 0;
|
||||
}
|
||||
|
||||
// Iterate over the set.
|
||||
iterator begin() {
|
||||
return dense_.get();
|
||||
}
|
||||
iterator end() {
|
||||
return dense_.get() + size_;
|
||||
}
|
||||
|
||||
const_iterator begin() const {
|
||||
return dense_.get();
|
||||
}
|
||||
const_iterator end() const {
|
||||
return dense_.get() + size_;
|
||||
}
|
||||
|
||||
// Change the maximum size of the set.
|
||||
// Invalidates all iterators.
|
||||
void resize(int new_max_size) {
|
||||
if (size_ > new_max_size)
|
||||
size_ = new_max_size;
|
||||
if (new_max_size > max_size_) {
|
||||
int* a = new int[new_max_size];
|
||||
if (sparse_to_dense_) {
|
||||
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
||||
if (valgrind_) {
|
||||
for (int i = max_size_; i < new_max_size; i++)
|
||||
a[i] = 0xababababU;
|
||||
}
|
||||
delete[] sparse_to_dense_;
|
||||
}
|
||||
sparse_to_dense_ = a;
|
||||
void resize(int max_size);
|
||||
|
||||
a = new int[new_max_size];
|
||||
if (dense_) {
|
||||
memmove(a, dense_, size_*sizeof a[0]);
|
||||
if (valgrind_) {
|
||||
for (int i = size_; i < new_max_size; i++)
|
||||
a[i] = 0xababababU;
|
||||
}
|
||||
delete[] dense_;
|
||||
}
|
||||
dense_ = a;
|
||||
}
|
||||
max_size_ = new_max_size;
|
||||
}
|
||||
|
||||
// Return the maximum size of the array.
|
||||
// Return the maximum size of the set.
|
||||
// Indices can be in the range [0, max_size).
|
||||
int max_size() const { return max_size_; }
|
||||
|
||||
// Clear the array.
|
||||
void clear() { size_ = 0; }
|
||||
|
||||
// Check whether i is in the array.
|
||||
bool contains(int i) const {
|
||||
DCHECK_GE(i, 0);
|
||||
DCHECK_LT(i, max_size_);
|
||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
||||
return false;
|
||||
}
|
||||
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
||||
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
||||
dense_[sparse_to_dense_[i]] == i;
|
||||
int max_size() const {
|
||||
return max_size_;
|
||||
}
|
||||
|
||||
// Adds i to the set.
|
||||
void insert(int i) {
|
||||
if (!contains(i))
|
||||
insert_new(i);
|
||||
// Clear the set.
|
||||
void clear() {
|
||||
size_ = 0;
|
||||
}
|
||||
|
||||
// Set the value at the new index i to v.
|
||||
// Check whether index i is in the set.
|
||||
bool contains(int i) const;
|
||||
|
||||
// Comparison function for sorting.
|
||||
// Can sort the sparse set so that future iterations
|
||||
// will visit indices in increasing order using
|
||||
// std::sort(arr.begin(), arr.end(), arr.less);
|
||||
static bool less(int a, int b);
|
||||
|
||||
public:
|
||||
// Insert index i into the set.
|
||||
iterator insert(int i) {
|
||||
return InsertInternal(true, i);
|
||||
}
|
||||
|
||||
// Insert index i into the set.
|
||||
// Fast but unsafe: only use if contains(i) is false.
|
||||
void insert_new(int i) {
|
||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
||||
iterator insert_new(int i) {
|
||||
return InsertInternal(false, i);
|
||||
}
|
||||
|
||||
private:
|
||||
iterator InsertInternal(bool allow_existing, int i) {
|
||||
DebugCheckInvariants();
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||
assert(false && "illegal index");
|
||||
// Semantically, end() would be better here, but we already know
|
||||
// the user did something stupid, so begin() insulates them from
|
||||
// dereferencing an invalid pointer.
|
||||
return;
|
||||
return begin();
|
||||
}
|
||||
DCHECK(!contains(i));
|
||||
DCHECK_LT(size_, max_size_);
|
||||
sparse_to_dense_[i] = size_;
|
||||
dense_[size_] = i;
|
||||
size_++;
|
||||
if (!allow_existing) {
|
||||
assert(!contains(i));
|
||||
create_index(i);
|
||||
} else {
|
||||
if (!contains(i))
|
||||
create_index(i);
|
||||
}
|
||||
DebugCheckInvariants();
|
||||
return dense_.get() + sparse_[i];
|
||||
}
|
||||
|
||||
// Comparison function for sorting.
|
||||
// Can sort the sparse array so that future iterations
|
||||
// will visit indices in increasing order using
|
||||
// sort(arr.begin(), arr.end(), arr.less);
|
||||
static bool less(int a, int b) { return a < b; }
|
||||
// Add the index i to the set.
|
||||
// Only use if contains(i) is known to be false.
|
||||
// This function is private, only intended as a helper
|
||||
// for other methods.
|
||||
void create_index(int i);
|
||||
|
||||
private:
|
||||
int size_;
|
||||
int max_size_;
|
||||
int* sparse_to_dense_;
|
||||
int* dense_;
|
||||
bool valgrind_;
|
||||
// In debug mode, verify that some invariant properties of the class
|
||||
// are being maintained. This is called at the end of the constructor
|
||||
// and at the beginning and end of all public non-const member functions.
|
||||
void DebugCheckInvariants() const;
|
||||
|
||||
DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
|
||||
// Initializes memory for elements [min, max).
|
||||
void MaybeInitializeMemory(int min, int max) {
|
||||
#if __has_feature(memory_sanitizer)
|
||||
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
|
||||
#elif defined(RE2_ON_VALGRIND)
|
||||
for (int i = min; i < max; i++) {
|
||||
sparse_[i] = 0xababababU;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
int size_ = 0;
|
||||
int max_size_ = 0;
|
||||
std::unique_ptr<int[]> sparse_;
|
||||
std::unique_ptr<int[]> dense_;
|
||||
};
|
||||
|
||||
template<typename Value>
|
||||
SparseSetT<Value>::SparseSetT() = default;
|
||||
|
||||
// Change the maximum size of the set.
|
||||
// Invalidates all iterators.
|
||||
template<typename Value>
|
||||
void SparseSetT<Value>::resize(int max_size) {
|
||||
DebugCheckInvariants();
|
||||
if (max_size > max_size_) {
|
||||
std::unique_ptr<int[]> a(new int[max_size]);
|
||||
if (sparse_) {
|
||||
std::copy_n(sparse_.get(), max_size_, a.get());
|
||||
}
|
||||
sparse_ = std::move(a);
|
||||
|
||||
std::unique_ptr<int[]> b(new int[max_size]);
|
||||
if (dense_) {
|
||||
std::copy_n(dense_.get(), max_size_, b.get());
|
||||
}
|
||||
dense_ = std::move(b);
|
||||
|
||||
MaybeInitializeMemory(max_size_, max_size);
|
||||
}
|
||||
max_size_ = max_size;
|
||||
if (size_ > max_size_)
|
||||
size_ = max_size_;
|
||||
DebugCheckInvariants();
|
||||
}
|
||||
|
||||
// Check whether index i is in the set.
|
||||
template<typename Value>
|
||||
bool SparseSetT<Value>::contains(int i) const {
|
||||
assert(i >= 0);
|
||||
assert(i < max_size_);
|
||||
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||
return false;
|
||||
}
|
||||
// Unsigned comparison avoids checking sparse_[i] < 0.
|
||||
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
||||
dense_[sparse_[i]] == i;
|
||||
}
|
||||
|
||||
template<typename Value>
|
||||
void SparseSetT<Value>::create_index(int i) {
|
||||
assert(!contains(i));
|
||||
assert(size_ < max_size_);
|
||||
sparse_[i] = size_;
|
||||
dense_[size_] = i;
|
||||
size_++;
|
||||
}
|
||||
|
||||
// Builds an empty set able to hold indices in [0, max_size).
template<typename Value> SparseSetT<Value>::SparseSetT(int max_size)
    : size_(0),
      max_size_(max_size),
      sparse_(new int[max_size]),
      dense_(new int[max_size]) {
  // Hand the whole freshly allocated range to the sanitizer/Valgrind hook.
  MaybeInitializeMemory(0, max_size);
  DebugCheckInvariants();
}
|
||||
|
||||
// Destructor: only re-validates the invariants; no explicit cleanup is
// written here.
template<typename Value>
SparseSetT<Value>::~SparseSetT() {
  DebugCheckInvariants();
}
|
||||
|
||||
// Debug-only consistency checks (compiled out together with assert()):
// size_ lies in [0, max_size_], and a non-empty set has a sparse array.
template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const {
  assert(0 <= size_);
  assert(size_ <= max_size_);
  // sparse_ is a std::unique_ptr, so compare against nullptr, not NULL.
  assert(size_ == 0 || sparse_ != nullptr);
}
|
||||
|
||||
// Comparison function for sorting.
|
||||
template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
|
||||
return a < b;
|
||||
}
|
||||
|
||||
typedef SparseSetT<void> SparseSet;
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UTIL_SPARSE_SET_H__
|
||||
#endif // UTIL_SPARSE_SET_H_
|
||||
|
@ -1,87 +0,0 @@
|
||||
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/stringpiece.h"
|
||||
#include "util/util.h"
|
||||
|
||||
using re2::StringPiece;
|
||||
|
||||
// Streams the raw bytes of piece (data(), size()) to o via an unformatted
// write.
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  return o.write(piece.data(), piece.size());
}
|
||||
|
||||
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
|
||||
int len = x.size();
|
||||
if (len != y.size()) {
|
||||
return false;
|
||||
}
|
||||
const char* p = x.data();
|
||||
const char* p2 = y.data();
|
||||
// Test last byte in case strings share large common prefix
|
||||
if ((len > 0) && (p[len-1] != p2[len-1])) return false;
|
||||
const char* p_limit = p + len;
|
||||
for (; p < p_limit; p++, p2++) {
|
||||
if (*p != *p2)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Replaces the contents of *target with the length_ bytes starting at ptr_.
void StringPiece::CopyToString(string* target) const {
  target->assign(ptr_, length_);
}
|
||||
|
||||
int StringPiece::copy(char* buf, size_type n, size_type pos) const {
|
||||
int ret = min(length_ - pos, n);
|
||||
memcpy(buf, ptr_ + pos, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int StringPiece::find(const StringPiece& s, size_type pos) const {
|
||||
if (length_ < 0 || pos > static_cast<size_type>(length_))
|
||||
return npos;
|
||||
|
||||
const char* result = std::search(ptr_ + pos, ptr_ + length_,
|
||||
s.ptr_, s.ptr_ + s.length_);
|
||||
const size_type xpos = result - ptr_;
|
||||
return xpos + s.length_ <= static_cast<size_type>(length_) ? xpos : npos;
|
||||
}
|
||||
|
||||
int StringPiece::find(char c, size_type pos) const {
|
||||
if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
|
||||
return npos;
|
||||
}
|
||||
const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
|
||||
return result != ptr_ + length_ ? result - ptr_ : npos;
|
||||
}
|
||||
|
||||
int StringPiece::rfind(const StringPiece& s, size_type pos) const {
|
||||
if (length_ < s.length_) return npos;
|
||||
const size_t ulen = length_;
|
||||
if (s.length_ == 0) return min(ulen, pos);
|
||||
|
||||
const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
|
||||
const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
|
||||
return result != last ? result - ptr_ : npos;
|
||||
}
|
||||
|
||||
int StringPiece::rfind(char c, size_type pos) const {
|
||||
if (length_ <= 0) return npos;
|
||||
for (int i = min(pos, static_cast<size_type>(length_ - 1));
|
||||
i >= 0; --i) {
|
||||
if (ptr_[i] == c) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return npos;
|
||||
}
|
||||
|
||||
// Returns the sub-piece that starts at pos and spans at most n bytes.
// Both pos and n are clamped to what the piece actually holds.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type len = static_cast<size_type>(length_);
  const size_type start = pos > len ? len : pos;
  const size_type available = len - start;
  const size_type count = n > available ? available : n;
  return StringPiece(ptr_ + start, count);
}
|
||||
|
||||
const StringPiece::size_type StringPiece::npos = size_type(-1);
|
@ -1,78 +0,0 @@
|
||||
// Copyright 2002 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Appends the printf-style expansion of (format, ap) to *dst.
//
// The text is first formatted into a stack buffer.  If it does not fit, a
// heap buffer is used whose size is taken from vsnprintf's return value, or
// doubled when the implementation only reports failure (negative result).
// Blind doubling is bounded: if formatting still fails at the cap, the
// function gives up and leaves *dst unchanged instead of looping forever.
static void StringAppendV(std::string* dst, const char* format, va_list ap) {
  // First try with a small fixed size buffer
  char space[1024];

  // vsnprintf may invalidate the va_list it is given, so every attempt
  // works on a fresh copy and the caller's `ap` stays usable.
  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if (result >= 0 && static_cast<size_t>(result) < sizeof(space)) {
    // It fit
    dst->append(space, static_cast<size_t>(result));
    return;
  }

  // Largest buffer tried when vsnprintf does not report the needed size.
  // A result that is still negative at this size is treated as a genuine
  // formatting error rather than truncation.
  const size_t kMaxGuessedLength = static_cast<size_t>(1) << 26;  // 64 MiB

  // Repeatedly increase buffer size until it fits
  size_t length = sizeof(space);
  while (true) {
    if (result < 0) {
      // Older behavior: just try doubling the buffer size
      if (length >= kMaxGuessedLength)
        return;
      length *= 2;
    } else {
      // We need exactly "result+1" characters
      length = static_cast<size_t>(result) + 1;
    }

    // std::string owns the scratch buffer, so it is released even if the
    // append below throws.
    std::string buf(length, '\0');

    // Restore the va_list before we use it again
    va_copy(backup_ap, ap);
    result = vsnprintf(&buf[0], length, format, backup_ap);
    va_end(backup_ap);

    if (result >= 0 && static_cast<size_t>(result) < length) {
      // It fit
      dst->append(buf, 0, static_cast<size_t>(result));
      return;
    }
  }
}
|
||||
|
||||
// Formats the arguments printf-style and returns the text as a new string.
string StringPrintf(const char* format, ...) {
  string formatted;
  va_list args;
  va_start(args, format);
  StringAppendV(&formatted, format, args);
  va_end(args);
  return formatted;
}
|
||||
|
||||
// Replaces the contents of *dst with the printf-style formatted text.
void SStringPrintf(string* dst, const char* format, ...) {
  va_list args;
  va_start(args, format);
  // Start from an empty string so the append below acts as an overwrite.
  dst->clear();
  StringAppendV(dst, format, args);
  va_end(args);
}
|
||||
|
||||
// Appends the printf-style formatted text to the existing contents of *dst.
void StringAppendF(string* dst, const char* format, ...) {
  va_list args;
  va_start(args, format);
  StringAppendV(dst, format, args);
  va_end(args);
}
|
||||
|
||||
} // namespace re2
|
@ -2,8 +2,15 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "util/strutil.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#define snprintf _snprintf
|
||||
#define vsnprintf _vsnprintf
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
@ -12,16 +19,16 @@ namespace re2 {
|
||||
// Copies 'src' to 'dest', escaping dangerous characters using
|
||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
||||
// Returns the number of bytes written to 'dest' (not including the \0)
|
||||
// or -1 if there was insufficient space.
|
||||
// or (size_t)-1 if there was insufficient space.
|
||||
// ----------------------------------------------------------------------
|
||||
int CEscapeString(const char* src, int src_len, char* dest,
|
||||
int dest_len) {
|
||||
static size_t CEscapeString(const char* src, size_t src_len,
|
||||
char* dest, size_t dest_len) {
|
||||
const char* src_end = src + src_len;
|
||||
int used = 0;
|
||||
size_t used = 0;
|
||||
|
||||
for (; src < src_end; src++) {
|
||||
if (dest_len - used < 2) // Need space for two letter escape
|
||||
return -1;
|
||||
if (dest_len - used < 2) // space for two-character escape
|
||||
return (size_t)-1;
|
||||
|
||||
unsigned char c = *src;
|
||||
switch (c) {
|
||||
@ -36,9 +43,9 @@ int CEscapeString(const char* src, int src_len, char* dest,
|
||||
// digit then that digit must be escaped too to prevent it being
|
||||
// interpreted as part of the character code by C.
|
||||
if (c < ' ' || c > '~') {
|
||||
if (dest_len - used < 4) // need space for 4 letter escape
|
||||
return -1;
|
||||
sprintf(dest + used, "\\%03o", c);
|
||||
if (dest_len - used < 5) // space for four-character escape + \0
|
||||
return (size_t)-1;
|
||||
snprintf(dest + used, 5, "\\%03o", c);
|
||||
used += 4;
|
||||
} else {
|
||||
dest[used++] = c; break;
|
||||
@ -47,51 +54,111 @@ int CEscapeString(const char* src, int src_len, char* dest,
|
||||
}
|
||||
|
||||
if (dest_len - used < 1) // make sure that there is room for \0
|
||||
return -1;
|
||||
return (size_t)-1;
|
||||
|
||||
dest[used] = '\0'; // doesn't count towards return value though
|
||||
return used;
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// CEscape()
|
||||
// Copies 'src' to result, escaping dangerous characters using
|
||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
||||
// ----------------------------------------------------------------------
|
||||
string CEscape(const StringPiece& src) {
|
||||
const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
|
||||
char* dest = new char[dest_length];
|
||||
const int len = CEscapeString(src.data(), src.size(),
|
||||
dest, dest_length);
|
||||
string s = string(dest, len);
|
||||
const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion
|
||||
char* dest = new char[dest_len];
|
||||
const size_t used = CEscapeString(src.data(), src.size(),
|
||||
dest, dest_len);
|
||||
string s = string(dest, used);
|
||||
delete[] dest;
|
||||
return s;
|
||||
}
|
||||
|
||||
string PrefixSuccessor(const StringPiece& prefix) {
|
||||
void PrefixSuccessor(string* prefix) {
|
||||
// We can increment the last character in the string and be done
|
||||
// unless that character is 255, in which case we have to erase the
|
||||
// last character and increment the previous character, unless that
|
||||
// is 255, etc. If the string is empty or consists entirely of
|
||||
// 255's, the result is the empty string.
|
||||
bool done = false;
|
||||
string limit(prefix.data(), prefix.size());
|
||||
int index = limit.length() - 1;
|
||||
while (!done && index >= 0) {
|
||||
if ((limit[index]&255) == 255) {
|
||||
limit.erase(index);
|
||||
index--;
|
||||
while (!prefix->empty()) {
|
||||
char& c = prefix->back();
|
||||
if (c == '\xff') { // char literal avoids signed/unsigned.
|
||||
prefix->pop_back();
|
||||
} else {
|
||||
limit[index]++;
|
||||
done = true;
|
||||
++c;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!done) {
|
||||
return "";
|
||||
} else {
|
||||
return limit;
|
||||
}
|
||||
}
|
||||
|
||||
// Appends the printf-style expansion of (format, ap) to *dst.
//
// The text is first formatted into a stack buffer.  If it does not fit, a
// heap buffer is used whose size is taken from vsnprintf's return value, or
// doubled when the implementation only reports failure (negative result).
// Blind doubling is bounded: if formatting still fails at the cap, the
// function gives up and leaves *dst unchanged instead of looping forever.
static void StringAppendV(std::string* dst, const char* format, va_list ap) {
  // First try with a small fixed size buffer
  char space[1024];

  // vsnprintf may invalidate the va_list it is given, so every attempt
  // works on a fresh copy and the caller's `ap` stays usable.
  va_list backup_ap;
  va_copy(backup_ap, ap);
  int result = vsnprintf(space, sizeof(space), format, backup_ap);
  va_end(backup_ap);

  if (result >= 0 && static_cast<size_t>(result) < sizeof(space)) {
    // It fit
    dst->append(space, static_cast<size_t>(result));
    return;
  }

  // Largest buffer tried when vsnprintf does not report the needed size.
  // A result that is still negative at this size is treated as a genuine
  // formatting error rather than truncation.
  const size_t kMaxGuessedLength = static_cast<size_t>(1) << 26;  // 64 MiB

  // Repeatedly increase buffer size until it fits
  size_t length = sizeof(space);
  while (true) {
    if (result < 0) {
      // Older behavior: just try doubling the buffer size
      if (length >= kMaxGuessedLength)
        return;
      length *= 2;
    } else {
      // We need exactly "result+1" characters
      length = static_cast<size_t>(result) + 1;
    }

    // std::string owns the scratch buffer, so it is released even if the
    // append below throws.
    std::string buf(length, '\0');

    // Restore the va_list before we use it again
    va_copy(backup_ap, ap);
    result = vsnprintf(&buf[0], length, format, backup_ap);
    va_end(backup_ap);

    if (result >= 0 && static_cast<size_t>(result) < length) {
      // It fit
      dst->append(buf, 0, static_cast<size_t>(result));
      return;
    }
  }
}
|
||||
|
||||
// Formats the arguments printf-style and returns the text as a new string.
string StringPrintf(const char* format, ...) {
  string formatted;
  va_list args;
  va_start(args, format);
  StringAppendV(&formatted, format, args);
  va_end(args);
  return formatted;
}
|
||||
|
||||
// Replaces the contents of *dst with the printf-style formatted text.
void SStringPrintf(string* dst, const char* format, ...) {
  va_list args;
  va_start(args, format);
  // Start from an empty string so the append below acts as an overwrite.
  dst->clear();
  StringAppendV(dst, format, args);
  va_end(args);
}
|
||||
|
||||
// Appends the printf-style formatted text to the existing contents of *dst.
void StringAppendF(string* dst, const char* format, ...) {
  va_list args;
  va_start(args, format);
  StringAppendV(dst, format, args);
  va_end(args);
}
|
||||
|
||||
} // namespace re2
|
||||
|
23
contrib/libre2/util/strutil.h
Normal file
23
contrib/libre2/util/strutil.h
Normal file
@ -0,0 +1,23 @@
|
||||
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef UTIL_STRUTIL_H_
|
||||
#define UTIL_STRUTIL_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "re2/stringpiece.h"
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
string CEscape(const StringPiece& src);
|
||||
void PrefixSuccessor(string* prefix);
|
||||
string StringPrintf(const char* format, ...);
|
||||
void SStringPrintf(string* dst, const char* format, ...);
|
||||
void StringAppendF(string* dst, const char* format, ...);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // UTIL_STRUTIL_H_
|
@ -3,7 +3,10 @@
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <stdio.h>
|
||||
#ifndef _WIN32
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#include "util/test.h"
|
||||
|
||||
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
||||
@ -21,15 +24,7 @@ void RegisterTest(void (*fn)(void), const char *name) {
|
||||
tests[ntests++].name = name;
|
||||
}
|
||||
|
||||
namespace re2 {
|
||||
int64 VirtualProcessSize() {
|
||||
struct rusage ru;
|
||||
getrusage(RUSAGE_SELF, &ru);
|
||||
return (int64)ru.ru_maxrss*1024;
|
||||
}
|
||||
} // namespace re2
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int main(int argc, char** argv) {
|
||||
for (int i = 0; i < ntests; i++) {
|
||||
printf("%s\n", tests[i].name);
|
||||
tests[i].fn();
|
||||
|
@ -2,11 +2,12 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_TEST_H__
|
||||
#define RE2_UTIL_TEST_H__
|
||||
#ifndef UTIL_TEST_H_
|
||||
#define UTIL_TEST_H_
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "util/logging.h"
|
||||
|
||||
#define TEST(x, y) \
|
||||
void x##y(void); \
|
||||
@ -31,27 +32,15 @@ class TestRegisterer {
|
||||
#define EXPECT_GE CHECK_GE
|
||||
#define EXPECT_FALSE(x) CHECK(!(x))
|
||||
|
||||
#define ARRAYSIZE arraysize
|
||||
|
||||
#define EXPECT_TRUE_M(x, y) CHECK(x) << (y)
|
||||
#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
|
||||
#define ASSERT_TRUE_M(x, y) CHECK(x) << (y)
|
||||
#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y)
|
||||
|
||||
const bool UsingMallocCounter = false;
|
||||
namespace testing {
|
||||
class MallocCounter {
|
||||
public:
|
||||
MallocCounter(int x) { }
|
||||
MallocCounter(int x) {}
|
||||
static const int THIS_THREAD_ONLY = 0;
|
||||
long long HeapGrowth() { return 0; }
|
||||
long long PeakHeapGrowth() { return 0; }
|
||||
void Reset() { }
|
||||
void Reset() {}
|
||||
};
|
||||
} // namespace testing
|
||||
|
||||
namespace re2 {
|
||||
int64 VirtualProcessSize();
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UTIL_TEST_H__
|
||||
#endif // UTIL_TEST_H_
|
||||
|
@ -1,44 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/thread.h"
|
||||
|
||||
// Constructs a Thread that is not running and not joinable; no OS thread is
// created here.
Thread::Thread() {
|
||||
// NOTE(review): assigning 0 presumes pthread_t is an integer or pointer
// type on the target platform -- confirm.
pid_ = 0;
|
||||
running_ = 0;
|
||||
joinable_ = 0;
|
||||
}
|
||||
|
||||
// Destructor with an empty body: it neither joins nor detaches the thread.
Thread::~Thread() {
|
||||
}
|
||||
|
||||
// Thread entry point handed to pthread_create by Thread::Start().
// v is the Thread object to run; the function invokes its Run() method and
// always returns a null result.
void *startThread(void *v) {
|
||||
// Recover the Thread* that Start() passed through the void* argument.
Thread* t = (Thread*)v;
|
||||
t->Run();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Starts a new pthread that executes Run() via startThread.
// Must not be called while the thread is already marked running (CHECKed).
// If the thread was not made joinable with SetJoinable(true), it is detached
// immediately after creation.
void Thread::Start() {
|
||||
CHECK(!running_);
|
||||
// NOTE(review): the return value of pthread_create is not checked, so
// running_ is set to true (and pid_ may be detached) even if thread creation
// failed.
pthread_create(&pid_, 0, startThread, this);
|
||||
running_ = true;
|
||||
if (!joinable_)
|
||||
pthread_detach(pid_);
|
||||
}
|
||||
|
||||
// Waits for the thread to finish via pthread_join and marks it not running.
// Requires that the thread is marked running and joinable (both CHECKed).
void Thread::Join() {
|
||||
CHECK(running_);
|
||||
CHECK(joinable_);
|
||||
// Receives the thread's exit value from pthread_join; it is not used
// afterwards.
void *val;
|
||||
// NOTE(review): the return value of pthread_join is not checked.
pthread_join(pid_, &val);
|
||||
running_ = 0;
|
||||
}
|
||||
|
||||
// Sets whether the thread will be joinable (j == true) or detached on
// Start(). Must be called while the thread is not marked running (CHECKed).
void Thread::SetJoinable(bool j) {
|
||||
CHECK(!running_);
|
||||
joinable_ = j;
|
||||
}
|
@ -1,26 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_THREAD_H__
|
||||
#define RE2_UTIL_THREAD_H__
|
||||
|
||||
#include <pthread.h>
|
||||
|
||||
// Abstract base class for a pthread-backed thread. Subclasses supply the
// work to perform by overriding Run().
class Thread {
|
||||
public:
|
||||
Thread();
|
||||
virtual ~Thread();
|
||||
// Begins execution of Run() on a new thread.
void Start();
|
||||
// Waits for the thread; declared for use with joinable threads.
void Join();
|
||||
// Sets the joinable_ flag.
void SetJoinable(bool);
|
||||
// The thread body; pure virtual, so it must be implemented by subclasses.
virtual void Run() = 0;
|
||||
|
||||
private:
|
||||
// Handle of the underlying pthread.
pthread_t pid_;
|
||||
// Whether the thread is currently marked as running.
bool running_;
|
||||
// Whether the thread is to be joinable.
bool joinable_;
|
||||
};
|
||||
|
||||
#endif // RE2_UTIL_THREAD_H__
|
||||
|
@ -14,8 +14,9 @@
|
||||
* This file and rune.cc have been converted to compile as C++ code
|
||||
* in name space re2.
|
||||
*/
|
||||
#ifndef RE2_UTIL_UTF_H__
|
||||
#define RE2_UTIL_UTF_H__
|
||||
|
||||
#ifndef UTIL_UTF_H_
|
||||
#define UTIL_UTF_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -40,4 +41,4 @@ char* utfrune(const char*, Rune);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_UTIL_UTF_H__
|
||||
#endif // UTIL_UTF_H_
|
||||
|
@ -2,125 +2,21 @@
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_UTIL_UTIL_H__
|
||||
#define RE2_UTIL_UTIL_H__
|
||||
#ifndef UTIL_UTIL_H_
|
||||
#define UTIL_UTIL_H_
|
||||
|
||||
// C
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stddef.h> // For size_t
|
||||
#include <assert.h>
|
||||
#include <stdarg.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
#include <ctype.h> // For isdigit, isalpha.
|
||||
|
||||
// C++
|
||||
#include <vector>
|
||||
// TODO(junyer): Get rid of this.
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <iosfwd>
|
||||
#include <map>
|
||||
#include <stack>
|
||||
#include <ostream>
|
||||
#include <utility>
|
||||
#include <set>
|
||||
|
||||
// Use std names.
|
||||
using std::set;
|
||||
using std::pair;
|
||||
using std::vector;
|
||||
using std::string;
|
||||
using std::min;
|
||||
using std::max;
|
||||
using std::ostream;
|
||||
using std::map;
|
||||
using std::stack;
|
||||
using std::sort;
|
||||
using std::swap;
|
||||
using std::make_pair;
|
||||
|
||||
#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) && !defined(OS_ANDROID)
|
||||
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
#include <tr1/unordered_set>
|
||||
using std::tr1::unordered_set;
|
||||
|
||||
#else
|
||||
|
||||
#include <unordered_set>
|
||||
#if defined(WIN32) || defined(OS_ANDROID)
|
||||
using std::tr1::unordered_set;
|
||||
#else
|
||||
using std::unordered_set;
|
||||
#ifndef FALLTHROUGH_INTENDED
|
||||
#define FALLTHROUGH_INTENDED do { } while (0)
|
||||
#endif
|
||||
|
||||
#ifndef NO_THREAD_SAFETY_ANALYSIS
|
||||
#define NO_THREAD_SAFETY_ANALYSIS
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
typedef int8_t int8;
|
||||
typedef uint8_t uint8;
|
||||
typedef int16_t int16;
|
||||
typedef uint16_t uint16;
|
||||
typedef int32_t int32;
|
||||
typedef uint32_t uint32;
|
||||
typedef int64_t int64;
|
||||
typedef uint64_t uint64;
|
||||
|
||||
typedef unsigned long ulong;
|
||||
typedef unsigned int uint;
|
||||
typedef unsigned short ushort;
|
||||
|
||||
// COMPILE_ASSERT causes a compile error about msg if expr is not true.
|
||||
#if __cplusplus >= 201103L
|
||||
#define COMPILE_ASSERT(expr, msg) static_assert(expr, #msg)
|
||||
#else
|
||||
template<bool> struct CompileAssert {};
|
||||
#define COMPILE_ASSERT(expr, msg) \
|
||||
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
|
||||
#endif
|
||||
|
||||
// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions.
|
||||
// It goes in the private: declarations in a class.
|
||||
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
|
||||
TypeName(const TypeName&); \
|
||||
void operator=(const TypeName&)
|
||||
|
||||
#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
class StringPiece;
|
||||
|
||||
string CEscape(const StringPiece& src);
|
||||
int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
|
||||
|
||||
extern string StringPrintf(const char* format, ...);
|
||||
extern void SStringPrintf(string* dst, const char* format, ...);
|
||||
extern void StringAppendF(string* dst, const char* format, ...);
|
||||
extern string PrefixSuccessor(const StringPiece& prefix);
|
||||
|
||||
uint32 hashword(const uint32*, size_t, uint32);
|
||||
void hashword2(const uint32*, size_t, uint32*, uint32*);
|
||||
|
||||
// Returns a 32-bit hash of the buffer s by passing it to hashword as an
// array of uint32 words, with len/4 as the word count and seed as the seed.
// NOTE(review): len/4 truncates, so when len is not a multiple of 4 the
// count passed to hashword does not cover the trailing len%4 bytes.
// NOTE(review): the cast drops const and reinterprets char data as uint32,
// which presumes s is suitably aligned -- confirm against callers.
static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {
|
||||
return hashword((uint32*)s, len/4, seed);
|
||||
}
|
||||
|
||||
// Returns a 64-bit hash of the buffer s built from the two 32-bit values
// that hashword2 leaves in x and y: x forms the high 32 bits, y the low 32.
// NOTE(review): as with Hash32StringWithSeed, len/4 truncates and the cast
// reinterprets char data as uint32 -- confirm alignment and trailing-byte
// handling against callers.
static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {
|
||||
uint32 x, y;
|
||||
// x carries the seed into hashword2; y starts at zero.
x = seed;
|
||||
y = 0;
|
||||
hashword2((uint32*)s, len/4, &x, &y);
|
||||
// Widen x before shifting so the high half is not lost.
return ((uint64)x << 32) | y;
|
||||
}
|
||||
|
||||
int RunningOnValgrind();
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#include "util/arena.h"
|
||||
#include "util/logging.h"
|
||||
#include "util/mutex.h"
|
||||
#include "util/utf.h"
|
||||
|
||||
#endif // RE2_UTIL_UTIL_H__
|
||||
#endif // UTIL_UTIL_H_
|
||||
|
@ -1,24 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/valgrind.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
#ifndef __has_feature
|
||||
#define __has_feature(x) 0
|
||||
#endif
|
||||
|
||||
// Reports whether the process should be treated as running under Valgrind.
// The result is chosen at compile time:
//   - true when built with __has_feature(memory_sanitizer);
//   - otherwise the value of the RUNNING_ON_VALGRIND macro, if it is defined;
//   - otherwise 0.
int RunningOnValgrind() {
|
||||
#if __has_feature(memory_sanitizer)
|
||||
return true;
|
||||
#elif defined(RUNNING_ON_VALGRIND)
|
||||
return RUNNING_ON_VALGRIND;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace re2
|
File diff suppressed because it is too large
Load Diff
@ -623,7 +623,7 @@ struct ReplaceRegexpImpl
|
||||
{
|
||||
re2_st::StringPiece matches[max_captures];
|
||||
|
||||
int start_pos = 0;
|
||||
size_t start_pos = 0;
|
||||
while (start_pos < input.length())
|
||||
{
|
||||
/// If no more replacements possible for current string
|
||||
|
Loading…
Reference in New Issue
Block a user