mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 15:42:02 +00:00
Updated re2 to the latest version [#CLICKHOUSE-2]
This commit is contained in:
parent
9f57a1f7a5
commit
3e3d7b354a
@ -1,33 +1,30 @@
|
|||||||
set (re2_sources
|
set (re2_sources
|
||||||
./re2/tostring.cc
|
|
||||||
./re2/dfa.cc
|
|
||||||
./re2/prefilter.cc
|
|
||||||
./re2/compile.cc
|
|
||||||
./re2/regexp.cc
|
|
||||||
./re2/onepass.cc
|
|
||||||
./re2/prefilter_tree.cc
|
|
||||||
./re2/set.cc
|
|
||||||
./re2/filtered_re2.cc
|
|
||||||
./re2/perl_groups.cc
|
|
||||||
./re2/parse.cc
|
|
||||||
./re2/nfa.cc
|
|
||||||
./re2/bitstate.cc
|
./re2/bitstate.cc
|
||||||
./re2/simplify.cc
|
./re2/compile.cc
|
||||||
./re2/unicode_groups.cc
|
./re2/dfa.cc
|
||||||
|
./re2/filtered_re2.cc
|
||||||
./re2/mimics_pcre.cc
|
./re2/mimics_pcre.cc
|
||||||
./re2/re2.cc
|
./re2/nfa.cc
|
||||||
|
./re2/onepass.cc
|
||||||
|
./re2/parse.cc
|
||||||
|
./re2/perl_groups.cc
|
||||||
|
./re2/prefilter.cc
|
||||||
|
./re2/prefilter_tree.cc
|
||||||
./re2/prog.cc
|
./re2/prog.cc
|
||||||
|
./re2/re2.cc
|
||||||
|
./re2/regexp.cc
|
||||||
|
./re2/set.cc
|
||||||
|
./re2/simplify.cc
|
||||||
|
./re2/stringpiece.cc
|
||||||
|
./re2/tostring.cc
|
||||||
./re2/unicode_casefold.cc
|
./re2/unicode_casefold.cc
|
||||||
./util/strutil.cc
|
./re2/unicode_groups.cc
|
||||||
./util/stringpiece.cc
|
./util/benchmark.cc
|
||||||
./util/hash.cc
|
./util/fuzz.cc
|
||||||
./util/arena.cc
|
|
||||||
./util/valgrind.cc
|
|
||||||
./util/pcre.cc
|
./util/pcre.cc
|
||||||
./util/stringprintf.cc
|
|
||||||
./util/rune.cc
|
./util/rune.cc
|
||||||
./util/random.cc
|
./util/strutil.cc
|
||||||
./util/thread.cc
|
./util/test.cc
|
||||||
)
|
)
|
||||||
|
|
||||||
# Building re2 which is thread-safe and re2_st which is not.
|
# Building re2 which is thread-safe and re2_st which is not.
|
||||||
|
@ -1,9 +1 @@
|
|||||||
Source: hg clone https://re2.googlecode.com/hg re2
|
https://github.com/google/re2/tree/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0
|
||||||
|
|
||||||
Latest commit:
|
|
||||||
|
|
||||||
changeset: 118:1b483548272e
|
|
||||||
tag: tip
|
|
||||||
user: Russ Cox <rsc@swtch.com>
|
|
||||||
date: Mon Oct 06 15:08:47 2014 -0400
|
|
||||||
summary: doc: import clarifications from Go tree
|
|
||||||
|
@ -1,10 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
rm -rf re2_st
|
|
||||||
mkdir -p re2_st
|
|
||||||
|
|
||||||
for i in filtered_re2.h re2.h set.h stringpiece.h variadic_function.h;
|
|
||||||
do
|
|
||||||
cp $1/re2/$i re2_st/$i
|
|
||||||
sed -i -r 's/using re2::RE2;//g;s/namespace re2/namespace re2_st/g;s/re2::/re2_st::/g;s/\"re2\//\"re2_st\//g;s/(.*?_H)/\1_ST/g' re2_st/$i;
|
|
||||||
done
|
|
113
contrib/libre2/re2/bitmap256.h
Normal file
113
contrib/libre2/re2/bitmap256.h
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_BITMAP256_H_
|
||||||
|
#define RE2_BITMAP256_H_
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#include <intrin.h>
|
||||||
|
#endif
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
|
||||||
|
namespace re2 {
|
||||||
|
|
||||||
|
class Bitmap256 {
|
||||||
|
public:
|
||||||
|
Bitmap256() {
|
||||||
|
memset(words_, 0, sizeof words_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tests the bit with index c.
|
||||||
|
bool Test(int c) const {
|
||||||
|
DCHECK_GE(c, 0);
|
||||||
|
DCHECK_LE(c, 255);
|
||||||
|
|
||||||
|
return (words_[c / 64] & (1ULL << (c % 64))) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sets the bit with index c.
|
||||||
|
void Set(int c) {
|
||||||
|
DCHECK_GE(c, 0);
|
||||||
|
DCHECK_LE(c, 255);
|
||||||
|
|
||||||
|
words_[c / 64] |= (1ULL << (c % 64));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finds the next non-zero bit with index >= c.
|
||||||
|
// Returns -1 if no such bit exists.
|
||||||
|
int FindNextSetBit(int c) const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Finds the least significant non-zero bit in n.
|
||||||
|
static int FindLSBSet(uint64_t n) {
|
||||||
|
DCHECK_NE(n, 0);
|
||||||
|
|
||||||
|
#if defined(__GNUC__)
|
||||||
|
return __builtin_ctzll(n);
|
||||||
|
#elif defined(_MSC_VER) && defined(_M_X64)
|
||||||
|
unsigned long c;
|
||||||
|
_BitScanForward64(&c, n);
|
||||||
|
return static_cast<int>(c);
|
||||||
|
#elif defined(_MSC_VER) && defined(_M_IX86)
|
||||||
|
unsigned long c;
|
||||||
|
if (static_cast<uint32_t>(n) != 0) {
|
||||||
|
_BitScanForward(&c, static_cast<uint32_t>(n));
|
||||||
|
return static_cast<int>(c);
|
||||||
|
} else {
|
||||||
|
_BitScanForward(&c, static_cast<uint32_t>(n >> 32));
|
||||||
|
return static_cast<int>(c) + 32;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
int c = 63;
|
||||||
|
for (int shift = 1 << 5; shift != 0; shift >>= 1) {
|
||||||
|
uint64_t word = n << shift;
|
||||||
|
if (word != 0) {
|
||||||
|
n = word;
|
||||||
|
c -= shift;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return c;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t words_[4];
|
||||||
|
};
|
||||||
|
|
||||||
|
int Bitmap256::FindNextSetBit(int c) const {
|
||||||
|
DCHECK_GE(c, 0);
|
||||||
|
DCHECK_LE(c, 255);
|
||||||
|
|
||||||
|
// Check the word that contains the bit. Mask out any lower bits.
|
||||||
|
int i = c / 64;
|
||||||
|
uint64_t word = words_[i] & (~0ULL << (c % 64));
|
||||||
|
if (word != 0)
|
||||||
|
return (i * 64) + FindLSBSet(word);
|
||||||
|
|
||||||
|
// Check any following words.
|
||||||
|
i++;
|
||||||
|
switch (i) {
|
||||||
|
case 1:
|
||||||
|
if (words_[1] != 0)
|
||||||
|
return (1 * 64) + FindLSBSet(words_[1]);
|
||||||
|
FALLTHROUGH_INTENDED;
|
||||||
|
case 2:
|
||||||
|
if (words_[2] != 0)
|
||||||
|
return (2 * 64) + FindLSBSet(words_[2]);
|
||||||
|
FALLTHROUGH_INTENDED;
|
||||||
|
case 3:
|
||||||
|
if (words_[3] != 0)
|
||||||
|
return (3 * 64) + FindLSBSet(words_[3]);
|
||||||
|
FALLTHROUGH_INTENDED;
|
||||||
|
default:
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace re2
|
||||||
|
|
||||||
|
#endif // RE2_BITMAP256_H_
|
@ -17,6 +17,11 @@
|
|||||||
// SearchBitState is a fast replacement for the NFA code on small
|
// SearchBitState is a fast replacement for the NFA code on small
|
||||||
// regexps and texts when SearchOnePass cannot be used.
|
// regexps and texts when SearchOnePass cannot be used.
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "util/logging.h"
|
||||||
#include "re2/prog.h"
|
#include "re2/prog.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
|
|
||||||
@ -60,8 +65,8 @@ class BitState {
|
|||||||
int ncap_;
|
int ncap_;
|
||||||
|
|
||||||
static const int VisitedBits = 32;
|
static const int VisitedBits = 32;
|
||||||
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||||
int nvisited_; // # of words in bitmap
|
size_t nvisited_; // # of words in bitmap
|
||||||
|
|
||||||
Job *job_; // stack of text positions to explore
|
Job *job_; // stack of text positions to explore
|
||||||
int njob_;
|
int njob_;
|
||||||
@ -94,7 +99,7 @@ BitState::~BitState() {
|
|||||||
// If so, remember that it was visited so that the next time,
|
// If so, remember that it was visited so that the next time,
|
||||||
// we don't repeat the visit.
|
// we don't repeat the visit.
|
||||||
bool BitState::ShouldVisit(int id, const char* p) {
|
bool BitState::ShouldVisit(int id, const char* p) {
|
||||||
uint n = id * (text_.size() + 1) + (p - text_.begin());
|
size_t n = id * (text_.size() + 1) + (p - text_.begin());
|
||||||
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
||||||
return false;
|
return false;
|
||||||
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
||||||
@ -103,7 +108,6 @@ bool BitState::ShouldVisit(int id, const char* p) {
|
|||||||
|
|
||||||
// Grow the stack.
|
// Grow the stack.
|
||||||
bool BitState::GrowStack() {
|
bool BitState::GrowStack() {
|
||||||
// VLOG(0) << "Reallocate.";
|
|
||||||
maxjob_ *= 2;
|
maxjob_ *= 2;
|
||||||
Job* newjob = new Job[maxjob_];
|
Job* newjob = new Job[maxjob_];
|
||||||
memmove(newjob, job_, njob_*sizeof job_[0]);
|
memmove(newjob, job_, njob_*sizeof job_[0]);
|
||||||
@ -141,6 +145,7 @@ void BitState::Push(int id, const char* p, int arg) {
|
|||||||
// Return whether it succeeded.
|
// Return whether it succeeded.
|
||||||
bool BitState::TrySearch(int id0, const char* p0) {
|
bool BitState::TrySearch(int id0, const char* p0) {
|
||||||
bool matched = false;
|
bool matched = false;
|
||||||
|
bool inaltmatch = false;
|
||||||
const char* end = text_.end();
|
const char* end = text_.end();
|
||||||
njob_ = 0;
|
njob_ = 0;
|
||||||
Push(id0, p0, 0);
|
Push(id0, p0, 0);
|
||||||
@ -159,46 +164,37 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
|||||||
// would have, but we avoid the stack
|
// would have, but we avoid the stack
|
||||||
// manipulation.
|
// manipulation.
|
||||||
if (0) {
|
if (0) {
|
||||||
|
Next:
|
||||||
|
// If the Match of a non-greedy AltMatch failed,
|
||||||
|
// we stop ourselves from trying the ByteRange,
|
||||||
|
// which would steer us off the short circuit.
|
||||||
|
if (prog_->inst(id)->last() || inaltmatch)
|
||||||
|
continue;
|
||||||
|
id++;
|
||||||
|
|
||||||
CheckAndLoop:
|
CheckAndLoop:
|
||||||
if (!ShouldVisit(id, p))
|
if (!ShouldVisit(id, p))
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Visit ip, p.
|
// Visit ip, p.
|
||||||
// VLOG(0) << "Job: " << ip->id() << " "
|
|
||||||
// << (p - text_.begin()) << " " << arg;
|
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
Prog::Inst* ip = prog_->inst(id);
|
||||||
switch (ip->opcode()) {
|
switch (ip->opcode()) {
|
||||||
case kInstFail:
|
|
||||||
default:
|
default:
|
||||||
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
case kInstAlt:
|
case kInstFail:
|
||||||
// Cannot just
|
|
||||||
// Push(ip->out1(), p, 0);
|
|
||||||
// Push(ip->out(), p, 0);
|
|
||||||
// If, during the processing of ip->out(), we encounter
|
|
||||||
// ip->out1() via another path, we want to process it then.
|
|
||||||
// Pushing it here will inhibit that. Instead, re-push
|
|
||||||
// ip with arg==1 as a reminder to push ip->out1() later.
|
|
||||||
switch (arg) {
|
|
||||||
case 0:
|
|
||||||
Push(id, p, 1); // come back when we're done
|
|
||||||
id = ip->out();
|
|
||||||
goto CheckAndLoop;
|
|
||||||
|
|
||||||
case 1:
|
|
||||||
// Finished ip->out(); try ip->out1().
|
|
||||||
arg = 0;
|
|
||||||
id = ip->out1();
|
|
||||||
goto CheckAndLoop;
|
|
||||||
}
|
|
||||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
case kInstAltMatch:
|
case kInstAltMatch:
|
||||||
// One opcode is byte range; the other leads to match.
|
switch (arg) {
|
||||||
|
case 0:
|
||||||
|
inaltmatch = true;
|
||||||
|
Push(id, p, 1); // come back when we're done
|
||||||
|
|
||||||
|
// One opcode is ByteRange; the other leads to Match
|
||||||
|
// (possibly via Nop or Capture).
|
||||||
if (ip->greedy(prog_)) {
|
if (ip->greedy(prog_)) {
|
||||||
// out1 is the match
|
// out1 is the match
|
||||||
Push(ip->out1(), p, 0);
|
Push(ip->out1(), p, 0);
|
||||||
@ -211,29 +207,43 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
|||||||
id = ip->out();
|
id = ip->out();
|
||||||
goto CheckAndLoop;
|
goto CheckAndLoop;
|
||||||
|
|
||||||
|
case 1:
|
||||||
|
inaltmatch = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
|
||||||
|
continue;
|
||||||
|
|
||||||
case kInstByteRange: {
|
case kInstByteRange: {
|
||||||
int c = -1;
|
int c = -1;
|
||||||
if (p < end)
|
if (p < end)
|
||||||
c = *p & 0xFF;
|
c = *p & 0xFF;
|
||||||
if (ip->Matches(c)) {
|
if (!ip->Matches(c))
|
||||||
|
goto Next;
|
||||||
|
|
||||||
|
if (!ip->last())
|
||||||
|
Push(id+1, p, 0); // try the next when we're done
|
||||||
id = ip->out();
|
id = ip->out();
|
||||||
p++;
|
p++;
|
||||||
goto CheckAndLoop;
|
goto CheckAndLoop;
|
||||||
}
|
}
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
case kInstCapture:
|
case kInstCapture:
|
||||||
switch (arg) {
|
switch (arg) {
|
||||||
case 0:
|
case 0:
|
||||||
|
if (!ip->last())
|
||||||
|
Push(id+1, p, 0); // try the next when we're done
|
||||||
|
|
||||||
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
||||||
// Capture p to register, but save old value.
|
// Capture p to register, but save old value.
|
||||||
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
||||||
cap_[ip->cap()] = p;
|
cap_[ip->cap()] = p;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Continue on.
|
// Continue on.
|
||||||
id = ip->out();
|
id = ip->out();
|
||||||
goto CheckAndLoop;
|
goto CheckAndLoop;
|
||||||
|
|
||||||
case 1:
|
case 1:
|
||||||
// Finished ip->out(); restore the old value.
|
// Finished ip->out(); restore the old value.
|
||||||
cap_[ip->cap()] = p;
|
cap_[ip->cap()] = p;
|
||||||
@ -244,19 +254,23 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
|||||||
|
|
||||||
case kInstEmptyWidth:
|
case kInstEmptyWidth:
|
||||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||||
continue;
|
goto Next;
|
||||||
|
|
||||||
|
if (!ip->last())
|
||||||
|
Push(id+1, p, 0); // try the next when we're done
|
||||||
id = ip->out();
|
id = ip->out();
|
||||||
goto CheckAndLoop;
|
goto CheckAndLoop;
|
||||||
|
|
||||||
case kInstNop:
|
case kInstNop:
|
||||||
|
if (!ip->last())
|
||||||
|
Push(id+1, p, 0); // try the next when we're done
|
||||||
id = ip->out();
|
id = ip->out();
|
||||||
goto CheckAndLoop;
|
goto CheckAndLoop;
|
||||||
|
|
||||||
case kInstMatch: {
|
case kInstMatch: {
|
||||||
if (endmatch_ && p != text_.end())
|
if (endmatch_ && p != text_.end())
|
||||||
continue;
|
goto Next;
|
||||||
|
|
||||||
// VLOG(0) << "Found match.";
|
|
||||||
// We found a match. If the caller doesn't care
|
// We found a match. If the caller doesn't care
|
||||||
// where the match is, no point going further.
|
// where the match is, no point going further.
|
||||||
if (nsubmatch_ == 0)
|
if (nsubmatch_ == 0)
|
||||||
@ -270,7 +284,9 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
|||||||
if (submatch_[0].data() == NULL ||
|
if (submatch_[0].data() == NULL ||
|
||||||
(longest_ && p > submatch_[0].end())) {
|
(longest_ && p > submatch_[0].end())) {
|
||||||
for (int i = 0; i < nsubmatch_; i++)
|
for (int i = 0; i < nsubmatch_; i++)
|
||||||
submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
|
submatch_[i] =
|
||||||
|
StringPiece(cap_[2 * i],
|
||||||
|
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
// If going for first match, we're done.
|
// If going for first match, we're done.
|
||||||
@ -282,7 +298,7 @@ bool BitState::TrySearch(int id0, const char* p0) {
|
|||||||
return true;
|
return true;
|
||||||
|
|
||||||
// Otherwise, continue on in hope of a longer match.
|
// Otherwise, continue on in hope of a longer match.
|
||||||
continue;
|
goto Next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -308,13 +324,12 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
|||||||
submatch_ = submatch;
|
submatch_ = submatch;
|
||||||
nsubmatch_ = nsubmatch;
|
nsubmatch_ = nsubmatch;
|
||||||
for (int i = 0; i < nsubmatch_; i++)
|
for (int i = 0; i < nsubmatch_; i++)
|
||||||
submatch_[i] = NULL;
|
submatch_[i] = StringPiece();
|
||||||
|
|
||||||
// Allocate scratch space.
|
// Allocate scratch space.
|
||||||
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
||||||
visited_ = new uint32[nvisited_];
|
visited_ = new uint32_t[nvisited_];
|
||||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
||||||
// VLOG(0) << "nvisited_ = " << nvisited_;
|
|
||||||
|
|
||||||
ncap_ = 2*nsubmatch;
|
ncap_ = 2*nsubmatch;
|
||||||
if (ncap_ < 2)
|
if (ncap_ < 2)
|
||||||
@ -338,6 +353,14 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
|||||||
// but we are not clearing visited_ between calls to TrySearch,
|
// but we are not clearing visited_ between calls to TrySearch,
|
||||||
// so no work is duplicated and it ends up still being linear.
|
// so no work is duplicated and it ends up still being linear.
|
||||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
for (const char* p = text.begin(); p <= text.end(); p++) {
|
||||||
|
// Try to use memchr to find the first byte quickly.
|
||||||
|
int fb = prog_->first_byte();
|
||||||
|
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
|
||||||
|
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
|
||||||
|
if (p == NULL)
|
||||||
|
p = text.end();
|
||||||
|
}
|
||||||
|
|
||||||
cap_[0] = p;
|
cap_[0] = p;
|
||||||
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
||||||
return true;
|
return true;
|
||||||
|
@ -8,6 +8,13 @@
|
|||||||
// This file's external interface is just Regexp::CompileToProg.
|
// This file's external interface is just Regexp::CompileToProg.
|
||||||
// The Compiler class defined in this file is private.
|
// The Compiler class defined in this file is private.
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/prog.h"
|
#include "re2/prog.h"
|
||||||
#include "re2/re2.h"
|
#include "re2/re2.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
@ -28,14 +35,14 @@ namespace re2 {
|
|||||||
// is always the fail instruction, which never appears on a list.
|
// is always the fail instruction, which never appears on a list.
|
||||||
|
|
||||||
struct PatchList {
|
struct PatchList {
|
||||||
uint32 p;
|
uint32_t p;
|
||||||
|
|
||||||
// Returns patch list containing just p.
|
// Returns patch list containing just p.
|
||||||
static PatchList Mk(uint32 p);
|
static PatchList Mk(uint32_t p);
|
||||||
|
|
||||||
// Patches all the entries on l to have value v.
|
// Patches all the entries on l to have value v.
|
||||||
// Caller must not ever use patch list again.
|
// Caller must not ever use patch list again.
|
||||||
static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
|
static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v);
|
||||||
|
|
||||||
// Deref returns the next pointer pointed at by p.
|
// Deref returns the next pointer pointed at by p.
|
||||||
static PatchList Deref(Prog::Inst *inst0, PatchList l);
|
static PatchList Deref(Prog::Inst *inst0, PatchList l);
|
||||||
@ -47,7 +54,7 @@ struct PatchList {
|
|||||||
static PatchList nullPatchList = { 0 };
|
static PatchList nullPatchList = { 0 };
|
||||||
|
|
||||||
// Returns patch list containing just p.
|
// Returns patch list containing just p.
|
||||||
PatchList PatchList::Mk(uint32 p) {
|
PatchList PatchList::Mk(uint32_t p) {
|
||||||
PatchList l;
|
PatchList l;
|
||||||
l.p = p;
|
l.p = p;
|
||||||
return l;
|
return l;
|
||||||
@ -64,7 +71,7 @@ PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Patches all the entries on l to have value v.
|
// Patches all the entries on l to have value v.
|
||||||
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
|
void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) {
|
||||||
while (l.p != 0) {
|
while (l.p != 0) {
|
||||||
Prog::Inst* ip = &inst0[l.p>>1];
|
Prog::Inst* ip = &inst0[l.p>>1];
|
||||||
if (l.p&1) {
|
if (l.p&1) {
|
||||||
@ -103,17 +110,17 @@ PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
|
|||||||
|
|
||||||
// Compiled program fragment.
|
// Compiled program fragment.
|
||||||
struct Frag {
|
struct Frag {
|
||||||
uint32 begin;
|
uint32_t begin;
|
||||||
PatchList end;
|
PatchList end;
|
||||||
|
|
||||||
Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
|
Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector
|
||||||
Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
|
Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Input encodings.
|
// Input encodings.
|
||||||
enum Encoding {
|
enum Encoding {
|
||||||
kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
|
kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
|
||||||
kEncodingLatin1, // Latin1 (0-FF)
|
kEncodingLatin1, // Latin-1 (0-FF)
|
||||||
};
|
};
|
||||||
|
|
||||||
class Compiler : public Regexp::Walker<Frag> {
|
class Compiler : public Regexp::Walker<Frag> {
|
||||||
@ -125,12 +132,11 @@ class Compiler : public Regexp::Walker<Frag> {
|
|||||||
// Caller is responsible for deleting Prog when finished with it.
|
// Caller is responsible for deleting Prog when finished with it.
|
||||||
// If reversed is true, compiles for walking over the input
|
// If reversed is true, compiles for walking over the input
|
||||||
// string backward (reverses all concatenations).
|
// string backward (reverses all concatenations).
|
||||||
static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
|
static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
|
||||||
|
|
||||||
// Compiles alternation of all the re to a new Prog.
|
// Compiles alternation of all the re to a new Prog.
|
||||||
// Each re has a match with an id equal to its index in the vector.
|
// Each re has a match with an id equal to its index in the vector.
|
||||||
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
|
||||||
Regexp* re);
|
|
||||||
|
|
||||||
// Interface for Regexp::Walker, which helps traverse the Regexp.
|
// Interface for Regexp::Walker, which helps traverse the Regexp.
|
||||||
// The walk is purely post-recursive: given the machines for the
|
// The walk is purely post-recursive: given the machines for the
|
||||||
@ -162,7 +168,7 @@ class Compiler : public Regexp::Walker<Frag> {
|
|||||||
Frag NoMatch();
|
Frag NoMatch();
|
||||||
|
|
||||||
// Returns a fragment that matches the empty string.
|
// Returns a fragment that matches the empty string.
|
||||||
Frag Match(int32 id);
|
Frag Match(int32_t id);
|
||||||
|
|
||||||
// Returns a no-op fragment.
|
// Returns a no-op fragment.
|
||||||
Frag Nop();
|
Frag Nop();
|
||||||
@ -178,9 +184,6 @@ class Compiler : public Regexp::Walker<Frag> {
|
|||||||
// Returns -1 if no more instructions are available.
|
// Returns -1 if no more instructions are available.
|
||||||
int AllocInst(int n);
|
int AllocInst(int n);
|
||||||
|
|
||||||
// Deletes unused instructions.
|
|
||||||
void Trim();
|
|
||||||
|
|
||||||
// Rune range compiler.
|
// Rune range compiler.
|
||||||
|
|
||||||
// Begins a new alternation.
|
// Begins a new alternation.
|
||||||
@ -193,19 +196,35 @@ class Compiler : public Regexp::Walker<Frag> {
|
|||||||
void Add_80_10ffff();
|
void Add_80_10ffff();
|
||||||
|
|
||||||
// New suffix that matches the byte range lo-hi, then goes to next.
|
// New suffix that matches the byte range lo-hi, then goes to next.
|
||||||
int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
|
int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
|
||||||
int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
|
int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
|
||||||
|
|
||||||
|
// Returns true iff the suffix is cached.
|
||||||
|
bool IsCachedRuneByteSuffix(int id);
|
||||||
|
|
||||||
// Adds a suffix to alternation.
|
// Adds a suffix to alternation.
|
||||||
void AddSuffix(int id);
|
void AddSuffix(int id);
|
||||||
|
|
||||||
|
// Adds a suffix to the trie starting from the given root node.
|
||||||
|
// Returns zero iff allocating an instruction fails. Otherwise, returns
|
||||||
|
// the current root node, which might be different from what was given.
|
||||||
|
int AddSuffixRecursive(int root, int id);
|
||||||
|
|
||||||
|
// Finds the trie node for the given suffix. Returns a Frag in order to
|
||||||
|
// distinguish between pointing at the root node directly (end.p == 0)
|
||||||
|
// and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively).
|
||||||
|
Frag FindByteRange(int root, int id);
|
||||||
|
|
||||||
|
// Compares two ByteRanges and returns true iff they are equal.
|
||||||
|
bool ByteRangeEqual(int id1, int id2);
|
||||||
|
|
||||||
// Returns the alternation of all the added suffixes.
|
// Returns the alternation of all the added suffixes.
|
||||||
Frag EndRange();
|
Frag EndRange();
|
||||||
|
|
||||||
// Single rune.
|
// Single rune.
|
||||||
Frag Literal(Rune r, bool foldcase);
|
Frag Literal(Rune r, bool foldcase);
|
||||||
|
|
||||||
void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
|
void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor);
|
||||||
Prog* Finish();
|
Prog* Finish();
|
||||||
|
|
||||||
// Returns .* where dot = any byte
|
// Returns .* where dot = any byte
|
||||||
@ -223,14 +242,15 @@ class Compiler : public Regexp::Walker<Frag> {
|
|||||||
int inst_len_; // Number of instructions used.
|
int inst_len_; // Number of instructions used.
|
||||||
int inst_cap_; // Number of instructions allocated.
|
int inst_cap_; // Number of instructions allocated.
|
||||||
|
|
||||||
int64 max_mem_; // Total memory budget.
|
int64_t max_mem_; // Total memory budget.
|
||||||
|
|
||||||
map<uint64, int> rune_cache_;
|
std::unordered_map<uint64_t, int> rune_cache_;
|
||||||
Frag rune_range_;
|
Frag rune_range_;
|
||||||
|
|
||||||
RE2::Anchor anchor_; // anchor mode for RE2::Set
|
RE2::Anchor anchor_; // anchor mode for RE2::Set
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Compiler);
|
Compiler(const Compiler&) = delete;
|
||||||
|
Compiler& operator=(const Compiler&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
Compiler::Compiler() {
|
Compiler::Compiler() {
|
||||||
@ -265,6 +285,7 @@ int Compiler::AllocInst(int n) {
|
|||||||
while (inst_len_ + n > inst_cap_)
|
while (inst_len_ + n > inst_cap_)
|
||||||
inst_cap_ *= 2;
|
inst_cap_ *= 2;
|
||||||
Prog::Inst* ip = new Prog::Inst[inst_cap_];
|
Prog::Inst* ip = new Prog::Inst[inst_cap_];
|
||||||
|
if (inst_ != NULL)
|
||||||
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
|
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
|
||||||
memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
|
memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
|
||||||
delete[] inst_;
|
delete[] inst_;
|
||||||
@ -275,16 +296,6 @@ int Compiler::AllocInst(int n) {
|
|||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Compiler::Trim() {
|
|
||||||
if (inst_len_ < inst_cap_) {
|
|
||||||
Prog::Inst* ip = new Prog::Inst[inst_len_];
|
|
||||||
memmove(ip, inst_, inst_len_ * sizeof ip[0]);
|
|
||||||
delete[] inst_;
|
|
||||||
inst_ = ip;
|
|
||||||
inst_cap_ = inst_len_;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// These routines are somewhat hard to visualize in text --
|
// These routines are somewhat hard to visualize in text --
|
||||||
// see http://swtch.com/~rsc/regexp/regexp1.html for
|
// see http://swtch.com/~rsc/regexp/regexp1.html for
|
||||||
// pictures explaining what is going on here.
|
// pictures explaining what is going on here.
|
||||||
@ -393,16 +404,6 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
|
|||||||
if (id < 0)
|
if (id < 0)
|
||||||
return NoMatch();
|
return NoMatch();
|
||||||
inst_[id].InitByteRange(lo, hi, foldcase, 0);
|
inst_[id].InitByteRange(lo, hi, foldcase, 0);
|
||||||
prog_->byte_inst_count_++;
|
|
||||||
prog_->MarkByteRange(lo, hi);
|
|
||||||
if (foldcase && lo <= 'z' && hi >= 'a') {
|
|
||||||
if (lo < 'a')
|
|
||||||
lo = 'a';
|
|
||||||
if (hi > 'z')
|
|
||||||
hi = 'z';
|
|
||||||
if (lo <= hi)
|
|
||||||
prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
|
|
||||||
}
|
|
||||||
return Frag(id, PatchList::Mk(id << 1));
|
return Frag(id, PatchList::Mk(id << 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -416,7 +417,7 @@ Frag Compiler::Nop() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Returns a fragment that signals a match.
|
// Returns a fragment that signals a match.
|
||||||
Frag Compiler::Match(int32 match_id) {
|
Frag Compiler::Match(int32_t match_id) {
|
||||||
int id = AllocInst(1);
|
int id = AllocInst(1);
|
||||||
if (id < 0)
|
if (id < 0)
|
||||||
return NoMatch();
|
return NoMatch();
|
||||||
@ -430,16 +431,6 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
|
|||||||
if (id < 0)
|
if (id < 0)
|
||||||
return NoMatch();
|
return NoMatch();
|
||||||
inst_[id].InitEmptyWidth(empty, 0);
|
inst_[id].InitEmptyWidth(empty, 0);
|
||||||
if (empty & (kEmptyBeginLine|kEmptyEndLine))
|
|
||||||
prog_->MarkByteRange('\n', '\n');
|
|
||||||
if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
|
|
||||||
int j;
|
|
||||||
for (int i = 0; i < 256; i = j) {
|
|
||||||
for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
|
|
||||||
;
|
|
||||||
prog_->MarkByteRange(i, j-1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return Frag(id, PatchList::Mk(id << 1));
|
return Frag(id, PatchList::Mk(id << 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -482,7 +473,7 @@ void Compiler::BeginRange() {
|
|||||||
rune_range_.end = nullPatchList;
|
rune_range_.end = nullPatchList;
|
||||||
}
|
}
|
||||||
|
|
||||||
int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
|
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
|
||||||
int next) {
|
int next) {
|
||||||
Frag f = ByteRange(lo, hi, foldcase);
|
Frag f = ByteRange(lo, hi, foldcase);
|
||||||
if (next != 0) {
|
if (next != 0) {
|
||||||
@ -493,18 +484,18 @@ int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
|
|||||||
return f.begin;
|
return f.begin;
|
||||||
}
|
}
|
||||||
|
|
||||||
int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
|
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
|
||||||
// In Latin1 mode, there's no point in caching.
|
int next) {
|
||||||
// In forward UTF-8 mode, only need to cache continuation bytes.
|
return (uint64_t)next << 17 |
|
||||||
if (encoding_ == kEncodingLatin1 ||
|
(uint64_t)lo << 9 |
|
||||||
(encoding_ == kEncodingUTF8 &&
|
(uint64_t)hi << 1 |
|
||||||
!reversed_ &&
|
(uint64_t)foldcase;
|
||||||
!(0x80 <= lo && hi <= 0xbf))) {
|
|
||||||
return UncachedRuneByteSuffix(lo, hi, foldcase, next);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
|
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
|
||||||
map<uint64, int>::iterator it = rune_cache_.find(key);
|
int next) {
|
||||||
|
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
|
||||||
|
std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
|
||||||
if (it != rune_cache_.end())
|
if (it != rune_cache_.end())
|
||||||
return it->second;
|
return it->second;
|
||||||
int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
|
int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
|
||||||
@ -512,12 +503,31 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
|
|||||||
return id;
|
return id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Compiler::IsCachedRuneByteSuffix(int id) {
|
||||||
|
uint8_t lo = inst_[id].lo_;
|
||||||
|
uint8_t hi = inst_[id].hi_;
|
||||||
|
bool foldcase = inst_[id].foldcase() != 0;
|
||||||
|
int next = inst_[id].out();
|
||||||
|
|
||||||
|
uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
|
||||||
|
return rune_cache_.find(key) != rune_cache_.end();
|
||||||
|
}
|
||||||
|
|
||||||
void Compiler::AddSuffix(int id) {
|
void Compiler::AddSuffix(int id) {
|
||||||
|
if (failed_)
|
||||||
|
return;
|
||||||
|
|
||||||
if (rune_range_.begin == 0) {
|
if (rune_range_.begin == 0) {
|
||||||
rune_range_.begin = id;
|
rune_range_.begin = id;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (encoding_ == kEncodingUTF8) {
|
||||||
|
// Build a trie in order to reduce fanout.
|
||||||
|
rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
int alt = AllocInst(1);
|
int alt = AllocInst(1);
|
||||||
if (alt < 0) {
|
if (alt < 0) {
|
||||||
rune_range_.begin = 0;
|
rune_range_.begin = 0;
|
||||||
@ -527,6 +537,102 @@ void Compiler::AddSuffix(int id) {
|
|||||||
rune_range_.begin = alt;
|
rune_range_.begin = alt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int Compiler::AddSuffixRecursive(int root, int id) {
|
||||||
|
DCHECK(inst_[root].opcode() == kInstAlt ||
|
||||||
|
inst_[root].opcode() == kInstByteRange);
|
||||||
|
|
||||||
|
Frag f = FindByteRange(root, id);
|
||||||
|
if (IsNoMatch(f)) {
|
||||||
|
int alt = AllocInst(1);
|
||||||
|
if (alt < 0)
|
||||||
|
return 0;
|
||||||
|
inst_[alt].InitAlt(root, id);
|
||||||
|
return alt;
|
||||||
|
}
|
||||||
|
|
||||||
|
int br;
|
||||||
|
if (f.end.p == 0)
|
||||||
|
br = root;
|
||||||
|
else if (f.end.p&1)
|
||||||
|
br = inst_[f.begin].out1();
|
||||||
|
else
|
||||||
|
br = inst_[f.begin].out();
|
||||||
|
|
||||||
|
if (IsCachedRuneByteSuffix(br)) {
|
||||||
|
// We can't fiddle with cached suffixes, so make a clone of the head.
|
||||||
|
int byterange = AllocInst(1);
|
||||||
|
if (byterange < 0)
|
||||||
|
return 0;
|
||||||
|
inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
|
||||||
|
inst_[br].foldcase(), inst_[br].out());
|
||||||
|
|
||||||
|
// Ensure that the parent points to the clone, not to the original.
|
||||||
|
// Note that this could leave the head unreachable except via the cache.
|
||||||
|
br = byterange;
|
||||||
|
if (f.end.p == 0)
|
||||||
|
root = br;
|
||||||
|
else if (f.end.p&1)
|
||||||
|
inst_[f.begin].out1_ = br;
|
||||||
|
else
|
||||||
|
inst_[f.begin].set_out(br);
|
||||||
|
}
|
||||||
|
|
||||||
|
int out = inst_[id].out();
|
||||||
|
if (!IsCachedRuneByteSuffix(id)) {
|
||||||
|
// The head should be the instruction most recently allocated, so free it
|
||||||
|
// instead of leaving it unreachable.
|
||||||
|
DCHECK_EQ(id, inst_len_-1);
|
||||||
|
inst_[id].out_opcode_ = 0;
|
||||||
|
inst_[id].out1_ = 0;
|
||||||
|
inst_len_--;
|
||||||
|
}
|
||||||
|
|
||||||
|
out = AddSuffixRecursive(inst_[br].out(), out);
|
||||||
|
if (out == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
inst_[br].set_out(out);
|
||||||
|
return root;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool Compiler::ByteRangeEqual(int id1, int id2) {
|
||||||
|
return inst_[id1].lo() == inst_[id2].lo() &&
|
||||||
|
inst_[id1].hi() == inst_[id2].hi() &&
|
||||||
|
inst_[id1].foldcase() == inst_[id2].foldcase();
|
||||||
|
}
|
||||||
|
|
||||||
|
Frag Compiler::FindByteRange(int root, int id) {
|
||||||
|
if (inst_[root].opcode() == kInstByteRange) {
|
||||||
|
if (ByteRangeEqual(root, id))
|
||||||
|
return Frag(root, nullPatchList);
|
||||||
|
else
|
||||||
|
return NoMatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
while (inst_[root].opcode() == kInstAlt) {
|
||||||
|
int out1 = inst_[root].out1();
|
||||||
|
if (ByteRangeEqual(out1, id))
|
||||||
|
return Frag(root, PatchList::Mk((root << 1) | 1));
|
||||||
|
|
||||||
|
// CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
|
||||||
|
// what we're looking for, then we can stop immediately. Unfortunately, we
|
||||||
|
// can't short-circuit the search in reverse mode.
|
||||||
|
if (!reversed_)
|
||||||
|
return NoMatch();
|
||||||
|
|
||||||
|
int out = inst_[root].out();
|
||||||
|
if (inst_[out].opcode() == kInstAlt)
|
||||||
|
root = out;
|
||||||
|
else if (ByteRangeEqual(out, id))
|
||||||
|
return Frag(root, PatchList::Mk(root << 1));
|
||||||
|
else
|
||||||
|
return NoMatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG(DFATAL) << "should never happen";
|
||||||
|
return NoMatch();
|
||||||
|
}
|
||||||
|
|
||||||
Frag Compiler::EndRange() {
|
Frag Compiler::EndRange() {
|
||||||
return rune_range_;
|
return rune_range_;
|
||||||
}
|
}
|
||||||
@ -550,12 +656,13 @@ void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
|
void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
|
||||||
// Latin1 is easy: runes *are* bytes.
|
// Latin-1 is easy: runes *are* bytes.
|
||||||
if (lo > hi || lo > 0xFF)
|
if (lo > hi || lo > 0xFF)
|
||||||
return;
|
return;
|
||||||
if (hi > 0xFF)
|
if (hi > 0xFF)
|
||||||
hi = 0xFF;
|
hi = 0xFF;
|
||||||
AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
|
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
|
||||||
|
static_cast<uint8_t>(hi), foldcase, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Table describing how to make a UTF-8 matching machine
|
// Table describing how to make a UTF-8 matching machine
|
||||||
@ -591,12 +698,13 @@ static struct ByteRangeProg {
|
|||||||
|
|
||||||
void Compiler::Add_80_10ffff() {
|
void Compiler::Add_80_10ffff() {
|
||||||
int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
|
int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
|
||||||
for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) {
|
for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
|
||||||
const ByteRangeProg& p = prog_80_10ffff[i];
|
const ByteRangeProg& p = prog_80_10ffff[i];
|
||||||
int next = 0;
|
int next = 0;
|
||||||
if (p.next >= 0)
|
if (p.next >= 0)
|
||||||
next = inst[p.next];
|
next = inst[p.next];
|
||||||
inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
|
inst[i] = UncachedRuneByteSuffix(static_cast<uint8_t>(p.lo),
|
||||||
|
static_cast<uint8_t>(p.hi), false, next);
|
||||||
if ((p.lo & 0xC0) != 0x80)
|
if ((p.lo & 0xC0) != 0x80)
|
||||||
AddSuffix(inst[i]);
|
AddSuffix(inst[i]);
|
||||||
}
|
}
|
||||||
@ -625,13 +733,14 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
|
|||||||
|
|
||||||
// ASCII range is always a special case.
|
// ASCII range is always a special case.
|
||||||
if (hi < Runeself) {
|
if (hi < Runeself) {
|
||||||
AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
|
AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
|
||||||
|
static_cast<uint8_t>(hi), foldcase, 0));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split range into sections that agree on leading bytes.
|
// Split range into sections that agree on leading bytes.
|
||||||
for (int i = 1; i < UTFmax; i++) {
|
for (int i = 1; i < UTFmax; i++) {
|
||||||
uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
|
uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
|
||||||
if ((lo & ~m) != (hi & ~m)) {
|
if ((lo & ~m) != (hi & ~m)) {
|
||||||
if ((lo & m) != 0) {
|
if ((lo & m) != 0) {
|
||||||
AddRuneRangeUTF8(lo, lo|m, foldcase);
|
AddRuneRangeUTF8(lo, lo|m, foldcase);
|
||||||
@ -647,19 +756,55 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Finally. Generate byte matching equivalent for lo-hi.
|
// Finally. Generate byte matching equivalent for lo-hi.
|
||||||
uint8 ulo[UTFmax], uhi[UTFmax];
|
uint8_t ulo[UTFmax], uhi[UTFmax];
|
||||||
int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
|
int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
|
||||||
int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
|
int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
|
||||||
(void)m; // USED(m)
|
(void)m; // USED(m)
|
||||||
DCHECK_EQ(n, m);
|
DCHECK_EQ(n, m);
|
||||||
|
|
||||||
|
// The logic below encodes this thinking:
|
||||||
|
//
|
||||||
|
// 1. When we have built the whole suffix, we know that it cannot
|
||||||
|
// possibly be a suffix of anything longer: in forward mode, nothing
|
||||||
|
// else can occur before the leading byte; in reverse mode, nothing
|
||||||
|
// else can occur after the last continuation byte or else the leading
|
||||||
|
// byte would have to change. Thus, there is no benefit to caching
|
||||||
|
// the first byte of the suffix whereas there is a cost involved in
|
||||||
|
// cloning it if it begins a common prefix, which is fairly likely.
|
||||||
|
//
|
||||||
|
// 2. Conversely, the last byte of the suffix cannot possibly be a
|
||||||
|
// prefix of anything because next == 0, so we will never want to
|
||||||
|
// clone it, but it is fairly likely to be a common suffix. Perhaps
|
||||||
|
// more so in reverse mode than in forward mode because the former is
|
||||||
|
// "converging" towards lower entropy, but caching is still worthwhile
|
||||||
|
// for the latter in cases such as 80-BF.
|
||||||
|
//
|
||||||
|
// 3. Handling the bytes between the first and the last is less
|
||||||
|
// straightforward and, again, the approach depends on whether we are
|
||||||
|
// "converging" towards lower entropy: in forward mode, a single byte
|
||||||
|
// is unlikely to be part of a common suffix whereas a byte range
|
||||||
|
// is more likely so; in reverse mode, a byte range is unlikely to
|
||||||
|
// be part of a common suffix whereas a single byte is more likely
|
||||||
|
// so. The same benefit versus cost argument applies here.
|
||||||
int id = 0;
|
int id = 0;
|
||||||
if (reversed_) {
|
if (reversed_) {
|
||||||
for (int i = 0; i < n; i++)
|
for (int i = 0; i < n; i++) {
|
||||||
id = RuneByteSuffix(ulo[i], uhi[i], false, id);
|
// In reverse UTF-8 mode: cache the leading byte; don't cache the last
|
||||||
|
// continuation byte; cache anything else iff it's a single byte (XX-XX).
|
||||||
|
if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
|
||||||
|
id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||||
|
else
|
||||||
|
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
for (int i = n-1; i >= 0; i--)
|
for (int i = n-1; i >= 0; i--) {
|
||||||
id = RuneByteSuffix(ulo[i], uhi[i], false, id);
|
// In forward UTF-8 mode: don't cache the leading byte; cache the last
|
||||||
|
// continuation byte; cache anything else iff it's a byte range (XX-YY).
|
||||||
|
if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
|
||||||
|
id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||||
|
else
|
||||||
|
id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
AddSuffix(id);
|
AddSuffix(id);
|
||||||
}
|
}
|
||||||
@ -699,11 +844,11 @@ Frag Compiler::Literal(Rune r, bool foldcase) {
|
|||||||
case kEncodingUTF8: {
|
case kEncodingUTF8: {
|
||||||
if (r < Runeself) // Make common case fast.
|
if (r < Runeself) // Make common case fast.
|
||||||
return ByteRange(r, r, foldcase);
|
return ByteRange(r, r, foldcase);
|
||||||
uint8 buf[UTFmax];
|
uint8_t buf[UTFmax];
|
||||||
int n = runetochar(reinterpret_cast<char*>(buf), &r);
|
int n = runetochar(reinterpret_cast<char*>(buf), &r);
|
||||||
Frag f = ByteRange((uint8)buf[0], buf[0], false);
|
Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
|
||||||
for (int i = 1; i < n; i++)
|
for (int i = 1; i < n; i++)
|
||||||
f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
|
f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
|
||||||
return f;
|
return f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -732,9 +877,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
|||||||
|
|
||||||
case kRegexpHaveMatch: {
|
case kRegexpHaveMatch: {
|
||||||
Frag f = Match(re->match_id());
|
Frag f = Match(re->match_id());
|
||||||
// Remember unanchored match to end of string.
|
if (anchor_ == RE2::ANCHOR_BOTH) {
|
||||||
if (anchor_ != RE2::ANCHOR_BOTH)
|
// Append \z or else the subexpression will effectively be unanchored.
|
||||||
f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
|
// Complemented by the UNANCHORED case in CompileSet().
|
||||||
|
f = Cat(EmptyWidth(kEmptyEndText), f);
|
||||||
|
}
|
||||||
return f;
|
return f;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -753,16 +900,16 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
|||||||
}
|
}
|
||||||
|
|
||||||
case kRegexpStar:
|
case kRegexpStar:
|
||||||
return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
|
return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
|
||||||
|
|
||||||
case kRegexpPlus:
|
case kRegexpPlus:
|
||||||
return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
|
return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
|
||||||
|
|
||||||
case kRegexpQuest:
|
case kRegexpQuest:
|
||||||
return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
|
return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
|
||||||
|
|
||||||
case kRegexpLiteral:
|
case kRegexpLiteral:
|
||||||
return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
|
return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
|
||||||
|
|
||||||
case kRegexpLiteralString: {
|
case kRegexpLiteralString: {
|
||||||
// Concatenation of literals.
|
// Concatenation of literals.
|
||||||
@ -770,7 +917,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
|||||||
return Nop();
|
return Nop();
|
||||||
Frag f;
|
Frag f;
|
||||||
for (int i = 0; i < re->nrunes(); i++) {
|
for (int i = 0; i < re->nrunes(); i++) {
|
||||||
Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
|
Frag f1 = Literal(re->runes()[i],
|
||||||
|
(re->parse_flags()&Regexp::FoldCase) != 0);
|
||||||
if (i == 0)
|
if (i == 0)
|
||||||
f = f1;
|
f = f1;
|
||||||
else
|
else
|
||||||
@ -815,7 +963,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
|
|||||||
// If this range contains all of A-Za-z or none of it,
|
// If this range contains all of A-Za-z or none of it,
|
||||||
// the fold flag is unnecessary; don't bother.
|
// the fold flag is unnecessary; don't bother.
|
||||||
bool fold = foldascii;
|
bool fold = foldascii;
|
||||||
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
|
if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
|
||||||
|
('Z' < i->lo && i->hi < 'a'))
|
||||||
fold = false;
|
fold = false;
|
||||||
|
|
||||||
AddRuneRange(i->lo, i->hi, fold);
|
AddRuneRange(i->lo, i->hi, fold);
|
||||||
@ -949,7 +1098,7 @@ static bool IsAnchorEnd(Regexp** pre, int depth) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
|
||||||
RE2::Anchor anchor) {
|
RE2::Anchor anchor) {
|
||||||
prog_->set_flags(flags);
|
prog_->set_flags(flags);
|
||||||
|
|
||||||
@ -958,11 +1107,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
|||||||
max_mem_ = max_mem;
|
max_mem_ = max_mem;
|
||||||
if (max_mem <= 0) {
|
if (max_mem <= 0) {
|
||||||
max_inst_ = 100000; // more than enough
|
max_inst_ = 100000; // more than enough
|
||||||
} else if (max_mem <= static_cast<int64>(sizeof(Prog))) {
|
} else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
|
||||||
// No room for anything.
|
// No room for anything.
|
||||||
max_inst_ = 0;
|
max_inst_ = 0;
|
||||||
} else {
|
} else {
|
||||||
int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
|
int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
|
||||||
// Limit instruction count so that inst->id() fits nicely in an int.
|
// Limit instruction count so that inst->id() fits nicely in an int.
|
||||||
// SparseArray also assumes that the indices (inst->id()) are ints.
|
// SparseArray also assumes that the indices (inst->id()) are ints.
|
||||||
// The call to WalkExponential uses 2*max_inst_ below,
|
// The call to WalkExponential uses 2*max_inst_ below,
|
||||||
@ -978,7 +1127,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
|||||||
if (m > Prog::Inst::kMaxInst)
|
if (m > Prog::Inst::kMaxInst)
|
||||||
m = Prog::Inst::kMaxInst;
|
m = Prog::Inst::kMaxInst;
|
||||||
|
|
||||||
max_inst_ = m;
|
max_inst_ = static_cast<int>(m);
|
||||||
}
|
}
|
||||||
|
|
||||||
anchor_ = anchor;
|
anchor_ = anchor;
|
||||||
@ -989,10 +1138,9 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
|
|||||||
// If reversed is true, compiles a program that expects
|
// If reversed is true, compiles a program that expects
|
||||||
// to run over the input string backward (reverses all concatenations).
|
// to run over the input string backward (reverses all concatenations).
|
||||||
// The reversed flag is also recorded in the returned program.
|
// The reversed flag is also recorded in the returned program.
|
||||||
Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
|
||||||
Compiler c;
|
Compiler c;
|
||||||
|
c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
|
||||||
c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
|
|
||||||
c.reversed_ = reversed;
|
c.reversed_ = reversed;
|
||||||
|
|
||||||
// Simplify to remove things like counted repetitions
|
// Simplify to remove things like counted repetitions
|
||||||
@ -1007,7 +1155,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
|||||||
bool is_anchor_end = IsAnchorEnd(&sre, 0);
|
bool is_anchor_end = IsAnchorEnd(&sre, 0);
|
||||||
|
|
||||||
// Generate fragment for entire regexp.
|
// Generate fragment for entire regexp.
|
||||||
Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
|
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
|
||||||
sre->Decref();
|
sre->Decref();
|
||||||
if (c.failed_)
|
if (c.failed_)
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -1016,10 +1164,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
|||||||
// Turn off c.reversed_ (if it is set) to force the remaining concatenations
|
// Turn off c.reversed_ (if it is set) to force the remaining concatenations
|
||||||
// to behave normally.
|
// to behave normally.
|
||||||
c.reversed_ = false;
|
c.reversed_ = false;
|
||||||
Frag all = c.Cat(f, c.Match(0));
|
all = c.Cat(all, c.Match(0));
|
||||||
c.prog_->set_start(all.begin);
|
|
||||||
|
|
||||||
if (reversed) {
|
c.prog_->set_reversed(reversed);
|
||||||
|
if (c.prog_->reversed()) {
|
||||||
c.prog_->set_anchor_start(is_anchor_end);
|
c.prog_->set_anchor_start(is_anchor_end);
|
||||||
c.prog_->set_anchor_end(is_anchor_start);
|
c.prog_->set_anchor_end(is_anchor_start);
|
||||||
} else {
|
} else {
|
||||||
@ -1027,15 +1175,12 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
|
|||||||
c.prog_->set_anchor_end(is_anchor_end);
|
c.prog_->set_anchor_end(is_anchor_end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c.prog_->set_start(all.begin);
|
||||||
|
if (!c.prog_->anchor_start()) {
|
||||||
// Also create unanchored version, which starts with a .*? loop.
|
// Also create unanchored version, which starts with a .*? loop.
|
||||||
if (c.prog_->anchor_start()) {
|
all = c.Cat(c.DotStar(), all);
|
||||||
c.prog_->set_start_unanchored(c.prog_->start());
|
|
||||||
} else {
|
|
||||||
Frag unanchored = c.Cat(c.DotStar(), all);
|
|
||||||
c.prog_->set_start_unanchored(unanchored.begin);
|
|
||||||
}
|
}
|
||||||
|
c.prog_->set_start_unanchored(all.begin);
|
||||||
c.prog_->set_reversed(reversed);
|
|
||||||
|
|
||||||
// Hand ownership of prog_ to caller.
|
// Hand ownership of prog_ to caller.
|
||||||
return c.Finish();
|
return c.Finish();
|
||||||
@ -1050,22 +1195,20 @@ Prog* Compiler::Finish() {
|
|||||||
inst_len_ = 1;
|
inst_len_ = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Trim instruction to minimum array and transfer to Prog.
|
// Hand off the array to Prog.
|
||||||
Trim();
|
|
||||||
prog_->inst_ = inst_;
|
prog_->inst_ = inst_;
|
||||||
prog_->size_ = inst_len_;
|
prog_->size_ = inst_len_;
|
||||||
inst_ = NULL;
|
inst_ = NULL;
|
||||||
|
|
||||||
// Compute byte map.
|
|
||||||
prog_->ComputeByteMap();
|
|
||||||
|
|
||||||
prog_->Optimize();
|
prog_->Optimize();
|
||||||
|
prog_->Flatten();
|
||||||
|
prog_->ComputeByteMap();
|
||||||
|
|
||||||
// Record remaining memory for DFA.
|
// Record remaining memory for DFA.
|
||||||
if (max_mem_ <= 0) {
|
if (max_mem_ <= 0) {
|
||||||
prog_->set_dfa_mem(1<<20);
|
prog_->set_dfa_mem(1<<20);
|
||||||
} else {
|
} else {
|
||||||
int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
|
int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst);
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
m = 0;
|
m = 0;
|
||||||
prog_->set_dfa_mem(m);
|
prog_->set_dfa_mem(m);
|
||||||
@ -1077,11 +1220,11 @@ Prog* Compiler::Finish() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Converts Regexp to Prog.
|
// Converts Regexp to Prog.
|
||||||
Prog* Regexp::CompileToProg(int64 max_mem) {
|
Prog* Regexp::CompileToProg(int64_t max_mem) {
|
||||||
return Compiler::Compile(this, false, max_mem);
|
return Compiler::Compile(this, false, max_mem);
|
||||||
}
|
}
|
||||||
|
|
||||||
Prog* Regexp::CompileToReverseProg(int64 max_mem) {
|
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
|
||||||
return Compiler::Compile(this, true, max_mem);
|
return Compiler::Compile(this, true, max_mem);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1090,41 +1233,41 @@ Frag Compiler::DotStar() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Compiles RE set to Prog.
|
// Compiles RE set to Prog.
|
||||||
Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
|
||||||
Regexp* re) {
|
|
||||||
Compiler c;
|
Compiler c;
|
||||||
|
c.Setup(re->parse_flags(), max_mem, anchor);
|
||||||
|
|
||||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
|
Regexp* sre = re->Simplify();
|
||||||
c.Setup(pf, options.max_mem(), anchor);
|
if (sre == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
// Compile alternation of fragments.
|
Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_);
|
||||||
Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_);
|
sre->Decref();
|
||||||
re->Decref();
|
|
||||||
if (c.failed_)
|
if (c.failed_)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (anchor == RE2::UNANCHORED) {
|
|
||||||
// The trailing .* was added while handling kRegexpHaveMatch.
|
|
||||||
// We just have to add the leading one.
|
|
||||||
all = c.Cat(c.DotStar(), all);
|
|
||||||
}
|
|
||||||
|
|
||||||
c.prog_->set_start(all.begin);
|
|
||||||
c.prog_->set_start_unanchored(all.begin);
|
|
||||||
c.prog_->set_anchor_start(true);
|
c.prog_->set_anchor_start(true);
|
||||||
c.prog_->set_anchor_end(true);
|
c.prog_->set_anchor_end(true);
|
||||||
|
|
||||||
|
if (anchor == RE2::UNANCHORED) {
|
||||||
|
// Prepend .* or else the expression will effectively be anchored.
|
||||||
|
// Complemented by the ANCHOR_BOTH case in PostVisit().
|
||||||
|
all = c.Cat(c.DotStar(), all);
|
||||||
|
}
|
||||||
|
c.prog_->set_start(all.begin);
|
||||||
|
c.prog_->set_start_unanchored(all.begin);
|
||||||
|
|
||||||
Prog* prog = c.Finish();
|
Prog* prog = c.Finish();
|
||||||
if (prog == NULL)
|
if (prog == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
// Make sure DFA has enough memory to operate,
|
// Make sure DFA has enough memory to operate,
|
||||||
// since we're not going to fall back to the NFA.
|
// since we're not going to fall back to the NFA.
|
||||||
bool failed;
|
bool dfa_failed = false;
|
||||||
StringPiece sp = "hello, world";
|
StringPiece sp = "hello, world";
|
||||||
prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
|
prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
|
||||||
NULL, &failed, NULL);
|
NULL, &dfa_failed, NULL);
|
||||||
if (failed) {
|
if (dfa_failed) {
|
||||||
delete prog;
|
delete prog;
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
@ -1132,9 +1275,8 @@ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
|||||||
return prog;
|
return prog;
|
||||||
}
|
}
|
||||||
|
|
||||||
Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
|
||||||
Regexp* re) {
|
return Compiler::CompileSet(re, anchor, max_mem);
|
||||||
return Compiler::CompileSet(options, anchor, re);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -2,9 +2,13 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#include <string>
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "re2/filtered_re2.h"
|
#include "re2/filtered_re2.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
#include "re2/prefilter.h"
|
#include "re2/prefilter.h"
|
||||||
#include "re2/prefilter_tree.h"
|
#include "re2/prefilter_tree.h"
|
||||||
|
|
||||||
@ -15,6 +19,11 @@ FilteredRE2::FilteredRE2()
|
|||||||
prefilter_tree_(new PrefilterTree()) {
|
prefilter_tree_(new PrefilterTree()) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FilteredRE2::FilteredRE2(int min_atom_len)
|
||||||
|
: compiled_(false),
|
||||||
|
prefilter_tree_(new PrefilterTree(min_atom_len)) {
|
||||||
|
}
|
||||||
|
|
||||||
FilteredRE2::~FilteredRE2() {
|
FilteredRE2::~FilteredRE2() {
|
||||||
for (size_t i = 0; i < re2_vec_.size(); i++)
|
for (size_t i = 0; i < re2_vec_.size(); i++)
|
||||||
delete re2_vec_[i];
|
delete re2_vec_[i];
|
||||||
@ -33,16 +42,21 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
|
|||||||
}
|
}
|
||||||
delete re;
|
delete re;
|
||||||
} else {
|
} else {
|
||||||
*id = re2_vec_.size();
|
*id = static_cast<int>(re2_vec_.size());
|
||||||
re2_vec_.push_back(re);
|
re2_vec_.push_back(re);
|
||||||
}
|
}
|
||||||
|
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
|
|
||||||
void FilteredRE2::Compile(vector<string>* atoms) {
|
void FilteredRE2::Compile(std::vector<string>* atoms) {
|
||||||
if (compiled_ || re2_vec_.size() == 0) {
|
if (compiled_) {
|
||||||
LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
|
LOG(ERROR) << "Compile called already.";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (re2_vec_.empty()) {
|
||||||
|
LOG(ERROR) << "Compile called before Add.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,17 +72,17 @@ void FilteredRE2::Compile(vector<string>* atoms) {
|
|||||||
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
||||||
for (size_t i = 0; i < re2_vec_.size(); i++)
|
for (size_t i = 0; i < re2_vec_.size(); i++)
|
||||||
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
||||||
return i;
|
return static_cast<int>(i);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int FilteredRE2::FirstMatch(const StringPiece& text,
|
int FilteredRE2::FirstMatch(const StringPiece& text,
|
||||||
const vector<int>& atoms) const {
|
const std::vector<int>& atoms) const {
|
||||||
if (!compiled_) {
|
if (!compiled_) {
|
||||||
LOG(DFATAL) << "FirstMatch called before Compile";
|
LOG(DFATAL) << "FirstMatch called before Compile.";
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
vector<int> regexps;
|
std::vector<int> regexps;
|
||||||
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
|
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
|
||||||
for (size_t i = 0; i < regexps.size(); i++)
|
for (size_t i = 0; i < regexps.size(); i++)
|
||||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||||
@ -78,10 +92,10 @@ int FilteredRE2::FirstMatch(const StringPiece& text,
|
|||||||
|
|
||||||
bool FilteredRE2::AllMatches(
|
bool FilteredRE2::AllMatches(
|
||||||
const StringPiece& text,
|
const StringPiece& text,
|
||||||
const vector<int>& atoms,
|
const std::vector<int>& atoms,
|
||||||
vector<int>* matching_regexps) const {
|
std::vector<int>* matching_regexps) const {
|
||||||
matching_regexps->clear();
|
matching_regexps->clear();
|
||||||
vector<int> regexps;
|
std::vector<int> regexps;
|
||||||
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
|
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
|
||||||
for (size_t i = 0; i < regexps.size(); i++)
|
for (size_t i = 0; i < regexps.size(); i++)
|
||||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||||
@ -89,11 +103,16 @@ bool FilteredRE2::AllMatches(
|
|||||||
return !matching_regexps->empty();
|
return !matching_regexps->empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
|
void FilteredRE2::AllPotentials(
|
||||||
vector<int>* passed_regexps) {
|
const std::vector<int>& atoms,
|
||||||
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
|
std::vector<int>* potential_regexps) const {
|
||||||
|
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||||
|
std::vector<int>* passed_regexps) {
|
||||||
|
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
|
||||||
|
}
|
||||||
|
|
||||||
void FilteredRE2::PrintPrefilter(int regexpid) {
|
void FilteredRE2::PrintPrefilter(int regexpid) {
|
||||||
prefilter_tree_->PrintPrefilter(regexpid);
|
prefilter_tree_->PrintPrefilter(regexpid);
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_FILTERED_RE2_H_
|
||||||
|
#define RE2_FILTERED_RE2_H_
|
||||||
|
|
||||||
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
||||||
// It provides a prefilter mechanism that helps in cutting down the
|
// It provides a prefilter mechanism that helps in cutting down the
|
||||||
// number of regexps that need to be actually searched.
|
// number of regexps that need to be actually searched.
|
||||||
@ -18,20 +21,19 @@
|
|||||||
// indices of strings that were found in the text to get the actual
|
// indices of strings that were found in the text to get the actual
|
||||||
// regexp matches.
|
// regexp matches.
|
||||||
|
|
||||||
#ifndef RE2_FILTERED_RE2_H_
|
#include <string>
|
||||||
#define RE2_FILTERED_RE2_H_
|
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "re2/re2.h"
|
#include "re2/re2.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
using std::vector;
|
|
||||||
|
|
||||||
class PrefilterTree;
|
class PrefilterTree;
|
||||||
|
|
||||||
class FilteredRE2 {
|
class FilteredRE2 {
|
||||||
public:
|
public:
|
||||||
FilteredRE2();
|
FilteredRE2();
|
||||||
|
explicit FilteredRE2(int min_atom_len);
|
||||||
~FilteredRE2();
|
~FilteredRE2();
|
||||||
|
|
||||||
// Uses RE2 constructor to create a RE2 object (re). Returns
|
// Uses RE2 constructor to create a RE2 object (re). Returns
|
||||||
@ -47,7 +49,7 @@ class FilteredRE2 {
|
|||||||
// the search text should be lowercased first to find matching
|
// the search text should be lowercased first to find matching
|
||||||
// strings from the set of strings returned by Compile. Call after
|
// strings from the set of strings returned by Compile. Call after
|
||||||
// all Add calls are done.
|
// all Add calls are done.
|
||||||
void Compile(vector<string>* strings_to_match);
|
void Compile(std::vector<string>* strings_to_match);
|
||||||
|
|
||||||
// Returns the index of the first matching regexp.
|
// Returns the index of the first matching regexp.
|
||||||
// Returns -1 on no match. Can be called prior to Compile.
|
// Returns -1 on no match. Can be called prior to Compile.
|
||||||
@ -59,16 +61,24 @@ class FilteredRE2 {
|
|||||||
// Returns -1 on no match. Compile has to be called before
|
// Returns -1 on no match. Compile has to be called before
|
||||||
// calling this.
|
// calling this.
|
||||||
int FirstMatch(const StringPiece& text,
|
int FirstMatch(const StringPiece& text,
|
||||||
const vector<int>& atoms) const;
|
const std::vector<int>& atoms) const;
|
||||||
|
|
||||||
// Returns the indices of all matching regexps, after first clearing
|
// Returns the indices of all matching regexps, after first clearing
|
||||||
// matched_regexps.
|
// matched_regexps.
|
||||||
bool AllMatches(const StringPiece& text,
|
bool AllMatches(const StringPiece& text,
|
||||||
const vector<int>& atoms,
|
const std::vector<int>& atoms,
|
||||||
vector<int>* matching_regexps) const;
|
std::vector<int>* matching_regexps) const;
|
||||||
|
|
||||||
|
// Returns the indices of all potentially matching regexps after first
|
||||||
|
// clearing potential_regexps.
|
||||||
|
// A regexp is potentially matching if it passes the filter.
|
||||||
|
// If a regexp passes the filter it may still not match.
|
||||||
|
// A regexp that does not pass the filter is guaranteed to not match.
|
||||||
|
void AllPotentials(const std::vector<int>& atoms,
|
||||||
|
std::vector<int>* potential_regexps) const;
|
||||||
|
|
||||||
// The number of regexps added.
|
// The number of regexps added.
|
||||||
int NumRegexps() const { return re2_vec_.size(); }
|
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
@ -79,11 +89,11 @@ class FilteredRE2 {
|
|||||||
void PrintPrefilter(int regexpid);
|
void PrintPrefilter(int regexpid);
|
||||||
|
|
||||||
// Useful for testing and debugging.
|
// Useful for testing and debugging.
|
||||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||||
vector<int>* passed_regexps);
|
std::vector<int>* passed_regexps);
|
||||||
|
|
||||||
// All the regexps in the FilteredRE2.
|
// All the regexps in the FilteredRE2.
|
||||||
vector<RE2*> re2_vec_;
|
std::vector<RE2*> re2_vec_;
|
||||||
|
|
||||||
// Has the FilteredRE2 been compiled using Compile()
|
// Has the FilteredRE2 been compiled using Compile()
|
||||||
bool compiled_;
|
bool compiled_;
|
||||||
@ -91,9 +101,8 @@ class FilteredRE2 {
|
|||||||
// An AND-OR tree of string atoms used for filtering regexps.
|
// An AND-OR tree of string atoms used for filtering regexps.
|
||||||
PrefilterTree* prefilter_tree_;
|
PrefilterTree* prefilter_tree_;
|
||||||
|
|
||||||
//DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
|
FilteredRE2(const FilteredRE2&) = delete;
|
||||||
FilteredRE2(const FilteredRE2&);
|
FilteredRE2& operator=(const FilteredRE2&) = delete;
|
||||||
void operator=(const FilteredRE2&);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
// Regexp::MimicsPCRE checks for any of these conditions.
|
// Regexp::MimicsPCRE checks for any of these conditions.
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
#include "re2/walker-inl.h"
|
#include "re2/walker-inl.h"
|
||||||
|
|
||||||
@ -124,7 +125,8 @@ class EmptyStringWalker : public Regexp::Walker<bool> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
|
EmptyStringWalker(const EmptyStringWalker&) = delete;
|
||||||
|
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Called after visiting re's children. child_args contains the return
|
// Called after visiting re's children. child_args contains the return
|
||||||
|
@ -24,13 +24,24 @@
|
|||||||
// Like Thompson's original machine and like the DFA implementation, this
|
// Like Thompson's original machine and like the DFA implementation, this
|
||||||
// implementation notices a match only once it is one byte past it.
|
// implementation notices a match only once it is one byte past it.
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "re2/prog.h"
|
#include "re2/prog.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
|
#include "util/logging.h"
|
||||||
#include "util/sparse_array.h"
|
#include "util/sparse_array.h"
|
||||||
#include "util/sparse_set.h"
|
#include "util/sparse_set.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
|
static const bool ExtraDebug = false;
|
||||||
|
|
||||||
class NFA {
|
class NFA {
|
||||||
public:
|
public:
|
||||||
NFA(Prog* prog);
|
NFA(Prog* prog);
|
||||||
@ -51,12 +62,10 @@ class NFA {
|
|||||||
bool anchored, bool longest,
|
bool anchored, bool longest,
|
||||||
StringPiece* submatch, int nsubmatch);
|
StringPiece* submatch, int nsubmatch);
|
||||||
|
|
||||||
static const int Debug = 0;
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct Thread {
|
struct Thread {
|
||||||
union {
|
union {
|
||||||
int id;
|
int ref;
|
||||||
Thread* next; // when on free list
|
Thread* next; // when on free list
|
||||||
};
|
};
|
||||||
const char** capture;
|
const char** capture;
|
||||||
@ -65,15 +74,14 @@ class NFA {
|
|||||||
// State for explicit stack in AddToThreadq.
|
// State for explicit stack in AddToThreadq.
|
||||||
struct AddState {
|
struct AddState {
|
||||||
int id; // Inst to process
|
int id; // Inst to process
|
||||||
int j;
|
Thread* t; // if not null, set t0 = t before processing id
|
||||||
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
|
|
||||||
|
|
||||||
AddState()
|
AddState()
|
||||||
: id(0), j(-1), cap_j(NULL) {}
|
: id(0), t(NULL) {}
|
||||||
explicit AddState(int id)
|
explicit AddState(int id)
|
||||||
: id(id), j(-1), cap_j(NULL) {}
|
: id(id), t(NULL) {}
|
||||||
AddState(int id, const char* cap_j, int j)
|
AddState(int id, Thread* t)
|
||||||
: id(id), j(j), cap_j(cap_j) {}
|
: id(id), t(t) {}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Threadq is a list of threads. The list is sorted by the order
|
// Threadq is a list of threads. The list is sorted by the order
|
||||||
@ -82,19 +90,24 @@ class NFA {
|
|||||||
typedef SparseArray<Thread*> Threadq;
|
typedef SparseArray<Thread*> Threadq;
|
||||||
|
|
||||||
inline Thread* AllocThread();
|
inline Thread* AllocThread();
|
||||||
inline void FreeThread(Thread*);
|
inline Thread* Incref(Thread* t);
|
||||||
|
inline void Decref(Thread* t);
|
||||||
|
|
||||||
// Add id (or its children, following unlabeled arrows)
|
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||||
// to the workqueue q with associated capture info.
|
// Enqueues only the ByteRange instructions that match byte c.
|
||||||
void AddToThreadq(Threadq* q, int id, int flag,
|
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
||||||
const char* p, const char** capture);
|
// p is the current input position, and t0 is the current thread.
|
||||||
|
void AddToThreadq(Threadq* q, int id0, int c, int flag,
|
||||||
|
const char* p, Thread* t0);
|
||||||
|
|
||||||
// Run runq on byte c, appending new states to nextq.
|
// Run runq on byte c, appending new states to nextq.
|
||||||
// Updates matched_ and match_ as new, better matches are found.
|
// Updates matched_ and match_ as new, better matches are found.
|
||||||
// p is position of the next byte (the one after c)
|
// p is the position of byte c in the input string for AddToThreadq;
|
||||||
// in the input string, used when processing capturing parens.
|
// p-1 will be used when processing Match instructions.
|
||||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
|
||||||
// ^, $ and \b match the current input point (after c).
|
// ^, $ and \b match the current input position (after c).
|
||||||
|
// Frees all the threads on runq.
|
||||||
|
// If there is a shortcut to the end, returns that shortcut.
|
||||||
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
|
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
|
||||||
|
|
||||||
// Returns text version of capture information, for debugging.
|
// Returns text version of capture information, for debugging.
|
||||||
@ -102,10 +115,6 @@ class NFA {
|
|||||||
|
|
||||||
inline void CopyCapture(const char** dst, const char** src);
|
inline void CopyCapture(const char** dst, const char** src);
|
||||||
|
|
||||||
// Computes whether all matches must begin with the same first
|
|
||||||
// byte, and if so, returns that byte. If not, returns -1.
|
|
||||||
int ComputeFirstByte();
|
|
||||||
|
|
||||||
Prog* prog_; // underlying program
|
Prog* prog_; // underlying program
|
||||||
int start_; // start instruction in program
|
int start_; // start instruction in program
|
||||||
int ncapture_; // number of submatches to track
|
int ncapture_; // number of submatches to track
|
||||||
@ -118,16 +127,16 @@ class NFA {
|
|||||||
bool matched_; // any match so far?
|
bool matched_; // any match so far?
|
||||||
AddState* astack_; // pre-allocated for AddToThreadq
|
AddState* astack_; // pre-allocated for AddToThreadq
|
||||||
int nastack_;
|
int nastack_;
|
||||||
int first_byte_; // required first byte for match, or -1 if none
|
|
||||||
|
|
||||||
Thread* free_threads_; // free list
|
Thread* free_threads_; // free list
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(NFA);
|
NFA(const NFA&) = delete;
|
||||||
|
NFA& operator=(const NFA&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
NFA::NFA(Prog* prog) {
|
NFA::NFA(Prog* prog) {
|
||||||
prog_ = prog;
|
prog_ = prog;
|
||||||
start_ = prog->start();
|
start_ = prog_->start();
|
||||||
ncapture_ = 0;
|
ncapture_ = 0;
|
||||||
longest_ = false;
|
longest_ = false;
|
||||||
endmatch_ = false;
|
endmatch_ = false;
|
||||||
@ -135,12 +144,14 @@ NFA::NFA(Prog* prog) {
|
|||||||
etext_ = NULL;
|
etext_ = NULL;
|
||||||
q0_.resize(prog_->size());
|
q0_.resize(prog_->size());
|
||||||
q1_.resize(prog_->size());
|
q1_.resize(prog_->size());
|
||||||
nastack_ = 2*prog_->size();
|
// See NFA::AddToThreadq() for why this is so.
|
||||||
|
nastack_ = 2*prog_->inst_count(kInstCapture) +
|
||||||
|
prog_->inst_count(kInstEmptyWidth) +
|
||||||
|
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
|
||||||
astack_ = new AddState[nastack_];
|
astack_ = new AddState[nastack_];
|
||||||
match_ = NULL;
|
match_ = NULL;
|
||||||
matched_ = false;
|
matched_ = false;
|
||||||
free_threads_ = NULL;
|
free_threads_ = NULL;
|
||||||
first_byte_ = ComputeFirstByte();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NFA::~NFA() {
|
NFA::~NFA() {
|
||||||
@ -154,24 +165,36 @@ NFA::~NFA() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void NFA::FreeThread(Thread *t) {
|
|
||||||
if (t == NULL)
|
|
||||||
return;
|
|
||||||
t->next = free_threads_;
|
|
||||||
free_threads_ = t;
|
|
||||||
}
|
|
||||||
|
|
||||||
NFA::Thread* NFA::AllocThread() {
|
NFA::Thread* NFA::AllocThread() {
|
||||||
Thread* t = free_threads_;
|
Thread* t = free_threads_;
|
||||||
if (t == NULL) {
|
if (t == NULL) {
|
||||||
t = new Thread;
|
t = new Thread;
|
||||||
|
t->ref = 1;
|
||||||
t->capture = new const char*[ncapture_];
|
t->capture = new const char*[ncapture_];
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
free_threads_ = t->next;
|
free_threads_ = t->next;
|
||||||
|
t->ref = 1;
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
NFA::Thread* NFA::Incref(Thread* t) {
|
||||||
|
DCHECK(t != NULL);
|
||||||
|
t->ref++;
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
void NFA::Decref(Thread* t) {
|
||||||
|
if (t == NULL)
|
||||||
|
return;
|
||||||
|
t->ref--;
|
||||||
|
if (t->ref > 0)
|
||||||
|
return;
|
||||||
|
DCHECK_EQ(t->ref, 0);
|
||||||
|
t->next = free_threads_;
|
||||||
|
free_threads_ = t;
|
||||||
|
}
|
||||||
|
|
||||||
void NFA::CopyCapture(const char** dst, const char** src) {
|
void NFA::CopyCapture(const char** dst, const char** src) {
|
||||||
for (int i = 0; i < ncapture_; i+=2) {
|
for (int i = 0; i < ncapture_; i+=2) {
|
||||||
dst[i] = src[i];
|
dst[i] = src[i];
|
||||||
@ -180,35 +203,43 @@ void NFA::CopyCapture(const char** dst, const char** src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||||
|
// Enqueues only the ByteRange instructions that match byte c.
|
||||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
||||||
// The pointer p is the current input position, and m is the
|
// p is the current input position, and t0 is the current thread.
|
||||||
// current set of match boundaries.
|
void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
|
||||||
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
|
const char* p, Thread* t0) {
|
||||||
const char* p, const char** capture) {
|
|
||||||
if (id0 == 0)
|
if (id0 == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// Astack_ is pre-allocated to avoid resize operations.
|
// Use astack_ to hold our stack of instructions yet to process.
|
||||||
// It has room for 2*prog_->size() entries, which is enough:
|
// It was preallocated as follows:
|
||||||
// Each inst in prog can be processed at most once,
|
// two entries per Capture;
|
||||||
// pushing at most two entries on stk.
|
// one entry per EmptyWidth; and
|
||||||
|
// one entry per Nop.
|
||||||
int nstk = 0;
|
// This reflects the maximum number of stack pushes that each can
|
||||||
|
// perform. (Each instruction can be processed at most once.)
|
||||||
AddState* stk = astack_;
|
AddState* stk = astack_;
|
||||||
stk[nstk++] = AddState(id0);
|
int nstk = 0;
|
||||||
|
|
||||||
|
stk[nstk++] = AddState(id0);
|
||||||
while (nstk > 0) {
|
while (nstk > 0) {
|
||||||
DCHECK_LE(nstk, nastack_);
|
DCHECK_LE(nstk, nastack_);
|
||||||
const AddState& a = stk[--nstk];
|
AddState a = stk[--nstk];
|
||||||
if (a.j >= 0)
|
|
||||||
capture[a.j] = a.cap_j;
|
Loop:
|
||||||
|
if (a.t != NULL) {
|
||||||
|
// t0 was a thread that we allocated and copied in order to
|
||||||
|
// record the capture, so we must now decref it.
|
||||||
|
Decref(t0);
|
||||||
|
t0 = a.t;
|
||||||
|
}
|
||||||
|
|
||||||
int id = a.id;
|
int id = a.id;
|
||||||
if (id == 0)
|
if (id == 0)
|
||||||
continue;
|
continue;
|
||||||
if (q->has_index(id)) {
|
if (q->has_index(id)) {
|
||||||
if (Debug)
|
if (ExtraDebug)
|
||||||
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
|
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -231,62 +262,76 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag,
|
|||||||
|
|
||||||
case kInstAltMatch:
|
case kInstAltMatch:
|
||||||
// Save state; will pick up at next byte.
|
// Save state; will pick up at next byte.
|
||||||
t = AllocThread();
|
t = Incref(t0);
|
||||||
t->id = id;
|
|
||||||
CopyCapture(t->capture, capture);
|
|
||||||
*tp = t;
|
*tp = t;
|
||||||
// fall through
|
|
||||||
|
|
||||||
case kInstAlt:
|
DCHECK(!ip->last());
|
||||||
// Explore alternatives.
|
a = AddState(id+1);
|
||||||
stk[nstk++] = AddState(ip->out1());
|
goto Loop;
|
||||||
stk[nstk++] = AddState(ip->out());
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstNop:
|
case kInstNop:
|
||||||
|
if (!ip->last())
|
||||||
|
stk[nstk++] = AddState(id+1);
|
||||||
|
|
||||||
// Continue on.
|
// Continue on.
|
||||||
stk[nstk++] = AddState(ip->out());
|
a = AddState(ip->out());
|
||||||
break;
|
goto Loop;
|
||||||
|
|
||||||
case kInstCapture:
|
case kInstCapture:
|
||||||
|
if (!ip->last())
|
||||||
|
stk[nstk++] = AddState(id+1);
|
||||||
|
|
||||||
if ((j=ip->cap()) < ncapture_) {
|
if ((j=ip->cap()) < ncapture_) {
|
||||||
// Push a dummy whose only job is to restore capture[j]
|
// Push a dummy whose only job is to restore t0
|
||||||
// once we finish exploring this possibility.
|
// once we finish exploring this possibility.
|
||||||
stk[nstk++] = AddState(0, capture[j], j);
|
stk[nstk++] = AddState(0, t0);
|
||||||
|
|
||||||
// Record capture.
|
// Record capture.
|
||||||
capture[j] = p;
|
t = AllocThread();
|
||||||
|
CopyCapture(t->capture, t0->capture);
|
||||||
|
t->capture[j] = p;
|
||||||
|
t0 = t;
|
||||||
}
|
}
|
||||||
stk[nstk++] = AddState(ip->out());
|
a = AddState(ip->out());
|
||||||
break;
|
goto Loop;
|
||||||
|
|
||||||
|
case kInstByteRange:
|
||||||
|
if (!ip->Matches(c))
|
||||||
|
goto Next;
|
||||||
|
FALLTHROUGH_INTENDED;
|
||||||
|
|
||||||
case kInstMatch:
|
case kInstMatch:
|
||||||
case kInstByteRange:
|
|
||||||
// Save state; will pick up at next byte.
|
// Save state; will pick up at next byte.
|
||||||
t = AllocThread();
|
t = Incref(t0);
|
||||||
t->id = id;
|
|
||||||
CopyCapture(t->capture, capture);
|
|
||||||
*tp = t;
|
*tp = t;
|
||||||
if (Debug)
|
if (ExtraDebug)
|
||||||
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
|
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
|
||||||
|
|
||||||
|
Next:
|
||||||
|
if (ip->last())
|
||||||
break;
|
break;
|
||||||
|
a = AddState(id+1);
|
||||||
|
goto Loop;
|
||||||
|
|
||||||
case kInstEmptyWidth:
|
case kInstEmptyWidth:
|
||||||
|
if (!ip->last())
|
||||||
|
stk[nstk++] = AddState(id+1);
|
||||||
|
|
||||||
// Continue on if we have all the right flag bits.
|
// Continue on if we have all the right flag bits.
|
||||||
if (ip->empty() & ~flag)
|
if (ip->empty() & ~flag)
|
||||||
break;
|
break;
|
||||||
stk[nstk++] = AddState(ip->out());
|
a = AddState(ip->out());
|
||||||
break;
|
goto Loop;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run runq on byte c, appending new states to nextq.
|
// Run runq on byte c, appending new states to nextq.
|
||||||
// Updates match as new, better matches are found.
|
// Updates matched_ and match_ as new, better matches are found.
|
||||||
// p is position of the byte c in the input string,
|
// p is the position of byte c in the input string for AddToThreadq;
|
||||||
// used when processing capturing parens.
|
// p-1 will be used when processing Match instructions.
|
||||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
|
||||||
// ^, $ and \b match the current input point (after c).
|
// ^, $ and \b match the current input position (after c).
|
||||||
// Frees all the threads on runq.
|
// Frees all the threads on runq.
|
||||||
// If there is a shortcut to the end, returns that shortcut.
|
// If there is a shortcut to the end, returns that shortcut.
|
||||||
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||||
@ -300,12 +345,12 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
|||||||
if (longest_) {
|
if (longest_) {
|
||||||
// Can skip any threads started after our current best match.
|
// Can skip any threads started after our current best match.
|
||||||
if (matched_ && match_[0] < t->capture[0]) {
|
if (matched_ && match_[0] < t->capture[0]) {
|
||||||
FreeThread(t);
|
Decref(t);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int id = t->id;
|
int id = i->index();
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
Prog::Inst* ip = prog_->inst(id);
|
||||||
|
|
||||||
switch (ip->opcode()) {
|
switch (ip->opcode()) {
|
||||||
@ -315,8 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstByteRange:
|
case kInstByteRange:
|
||||||
if (ip->Matches(c))
|
AddToThreadq(nextq, ip->out(), c, flag, p, t);
|
||||||
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstAltMatch:
|
case kInstAltMatch:
|
||||||
@ -324,52 +368,58 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
|||||||
break;
|
break;
|
||||||
// The match is ours if we want it.
|
// The match is ours if we want it.
|
||||||
if (ip->greedy(prog_) || longest_) {
|
if (ip->greedy(prog_) || longest_) {
|
||||||
CopyCapture((const char**)match_, t->capture);
|
CopyCapture(match_, t->capture);
|
||||||
FreeThread(t);
|
|
||||||
for (++i; i != runq->end(); ++i)
|
|
||||||
FreeThread(i->second);
|
|
||||||
runq->clear();
|
|
||||||
matched_ = true;
|
matched_ = true;
|
||||||
|
|
||||||
|
Decref(t);
|
||||||
|
for (++i; i != runq->end(); ++i)
|
||||||
|
Decref(i->second);
|
||||||
|
runq->clear();
|
||||||
if (ip->greedy(prog_))
|
if (ip->greedy(prog_))
|
||||||
return ip->out1();
|
return ip->out1();
|
||||||
return ip->out();
|
return ip->out();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstMatch:
|
case kInstMatch: {
|
||||||
if (endmatch_ && p != etext_)
|
// Avoid invoking undefined behavior when p happens
|
||||||
|
// to be null - and p-1 would be meaningless anyway.
|
||||||
|
if (p == NULL)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (endmatch_ && p-1 != etext_)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
const char* old = t->capture[1]; // previous end pointer
|
|
||||||
t->capture[1] = p;
|
|
||||||
if (longest_) {
|
if (longest_) {
|
||||||
// Leftmost-longest mode: save this match only if
|
// Leftmost-longest mode: save this match only if
|
||||||
// it is either farther to the left or at the same
|
// it is either farther to the left or at the same
|
||||||
// point but longer than an existing match.
|
// point but longer than an existing match.
|
||||||
if (!matched_ || t->capture[0] < match_[0] ||
|
if (!matched_ || t->capture[0] < match_[0] ||
|
||||||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
|
(t->capture[0] == match_[0] && p-1 > match_[1])) {
|
||||||
CopyCapture((const char**)match_, t->capture);
|
CopyCapture(match_, t->capture);
|
||||||
|
match_[1] = p-1;
|
||||||
|
matched_ = true;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Leftmost-biased mode: this match is by definition
|
// Leftmost-biased mode: this match is by definition
|
||||||
// better than what we've already found (see next line).
|
// better than what we've already found (see next line).
|
||||||
CopyCapture((const char**)match_, t->capture);
|
CopyCapture(match_, t->capture);
|
||||||
|
match_[1] = p-1;
|
||||||
|
matched_ = true;
|
||||||
|
|
||||||
// Cut off the threads that can only find matches
|
// Cut off the threads that can only find matches
|
||||||
// worse than the one we just found: don't run the
|
// worse than the one we just found: don't run the
|
||||||
// rest of the current Threadq.
|
// rest of the current Threadq.
|
||||||
t->capture[0] = old;
|
Decref(t);
|
||||||
FreeThread(t);
|
|
||||||
for (++i; i != runq->end(); ++i)
|
for (++i; i != runq->end(); ++i)
|
||||||
FreeThread(i->second);
|
Decref(i->second);
|
||||||
runq->clear();
|
runq->clear();
|
||||||
matched_ = true;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
t->capture[0] = old;
|
|
||||||
matched_ = true;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
FreeThread(t);
|
}
|
||||||
|
Decref(t);
|
||||||
}
|
}
|
||||||
runq->clear();
|
runq->clear();
|
||||||
return 0;
|
return 0;
|
||||||
@ -391,12 +441,6 @@ string NFA::FormatCapture(const char** capture) {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns whether haystack contains needle's memory.
|
|
||||||
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
|
|
||||||
return haystack.begin() <= needle.begin() &&
|
|
||||||
haystack.end() >= needle.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||||
bool anchored, bool longest,
|
bool anchored, bool longest,
|
||||||
StringPiece* submatch, int nsubmatch) {
|
StringPiece* submatch, int nsubmatch) {
|
||||||
@ -407,12 +451,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
if (context.begin() == NULL)
|
if (context.begin() == NULL)
|
||||||
context = text;
|
context = text;
|
||||||
|
|
||||||
if (!StringPieceContains(context, text)) {
|
// Sanity check: make sure that text lies within context.
|
||||||
LOG(FATAL) << "Bad args: context does not contain text "
|
if (text.begin() < context.begin() || text.end() > context.end()) {
|
||||||
<< reinterpret_cast<const void*>(context.begin())
|
LOG(DFATAL) << "context does not contain text";
|
||||||
<< "+" << context.size() << " "
|
|
||||||
<< reinterpret_cast<const void*>(text.begin())
|
|
||||||
<< "+" << text.size();
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -445,16 +486,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
|
|
||||||
match_ = new const char*[ncapture_];
|
match_ = new const char*[ncapture_];
|
||||||
matched_ = false;
|
matched_ = false;
|
||||||
memset(match_, 0, ncapture_*sizeof match_[0]);
|
|
||||||
|
|
||||||
// For debugging prints.
|
// For debugging prints.
|
||||||
btext_ = context.begin();
|
btext_ = context.begin();
|
||||||
|
|
||||||
if (Debug) {
|
if (ExtraDebug)
|
||||||
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
||||||
text.as_string().c_str(), context.as_string().c_str(), anchored,
|
text.ToString().c_str(), context.ToString().c_str(), anchored,
|
||||||
longest);
|
longest);
|
||||||
}
|
|
||||||
|
|
||||||
// Set up search.
|
// Set up search.
|
||||||
Threadq* runq = &q0_;
|
Threadq* runq = &q0_;
|
||||||
@ -462,14 +501,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
runq->clear();
|
runq->clear();
|
||||||
nextq->clear();
|
nextq->clear();
|
||||||
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
|
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
|
||||||
const char* bp = context.begin();
|
|
||||||
int c = -1;
|
|
||||||
int wasword = 0;
|
int wasword = 0;
|
||||||
|
|
||||||
if (text.begin() > context.begin()) {
|
if (text.begin() > context.begin())
|
||||||
c = text.begin()[-1] & 0xFF;
|
wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
|
||||||
wasword = Prog::IsWordChar(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Loop over the text, stepping the machine.
|
// Loop over the text, stepping the machine.
|
||||||
for (const char* p = text.begin();; p++) {
|
for (const char* p = text.begin();; p++) {
|
||||||
@ -498,24 +533,29 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
else
|
else
|
||||||
flag |= kEmptyNonWordBoundary;
|
flag |= kEmptyNonWordBoundary;
|
||||||
|
|
||||||
if (Debug) {
|
if (ExtraDebug) {
|
||||||
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
|
int c = 0;
|
||||||
|
if (p == context.begin())
|
||||||
|
c = '^';
|
||||||
|
else if (p > text.end())
|
||||||
|
c = '$';
|
||||||
|
else if (p < text.end())
|
||||||
|
c = p[0] & 0xFF;
|
||||||
|
|
||||||
|
fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
|
||||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||||
Thread* t = i->second;
|
Thread* t = i->second;
|
||||||
if (t == NULL)
|
if (t == NULL)
|
||||||
continue;
|
continue;
|
||||||
fprintf(stderr, " %d%s", t->id,
|
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
|
||||||
FormatCapture((const char**)t->capture).c_str());
|
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process previous character (waited until now to avoid
|
// This is a no-op the first time around the loop because runq is empty.
|
||||||
// repeating the flag computation above).
|
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
|
||||||
// This is a no-op the first time around the loop, because
|
|
||||||
// runq is empty.
|
|
||||||
int id = Step(runq, nextq, c, flag, p-1);
|
|
||||||
DCHECK_EQ(runq->size(), 0);
|
DCHECK_EQ(runq->size(), 0);
|
||||||
|
using std::swap;
|
||||||
swap(nextq, runq);
|
swap(nextq, runq);
|
||||||
nextq->clear();
|
nextq->clear();
|
||||||
if (id != 0) {
|
if (id != 0) {
|
||||||
@ -529,6 +569,7 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstCapture:
|
case kInstCapture:
|
||||||
|
if (ip->cap() < ncapture_)
|
||||||
match_[ip->cap()] = p;
|
match_[ip->cap()] = p;
|
||||||
id = ip->out();
|
id = ip->out();
|
||||||
continue;
|
continue;
|
||||||
@ -541,14 +582,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
match_[1] = p;
|
match_[1] = p;
|
||||||
matched_ = true;
|
matched_ = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstEmptyWidth:
|
|
||||||
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
|
|
||||||
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
id = ip->out();
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -566,10 +599,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
// If there's a required first byte for an unanchored search
|
// If there's a required first byte for an unanchored search
|
||||||
// and we're not in the middle of any possible matches,
|
// and we're not in the middle of any possible matches,
|
||||||
// use memchr to search for the byte quickly.
|
// use memchr to search for the byte quickly.
|
||||||
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
|
int fb = prog_->first_byte();
|
||||||
p < text.end() && (p[0] & 0xFF) != first_byte_) {
|
if (!anchored && runq->size() == 0 &&
|
||||||
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
|
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
|
||||||
text.end() - p));
|
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
|
||||||
if (p == NULL) {
|
if (p == NULL) {
|
||||||
p = text.end();
|
p = text.end();
|
||||||
isword = 0;
|
isword = 0;
|
||||||
@ -579,59 +612,48 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
|||||||
flag = Prog::EmptyFlags(context, p);
|
flag = Prog::EmptyFlags(context, p);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Steal match storage (cleared but unused as of yet)
|
Thread* t = AllocThread();
|
||||||
// temporarily to hold match boundaries for new thread.
|
CopyCapture(t->capture, match_);
|
||||||
match_[0] = p;
|
t->capture[0] = p;
|
||||||
AddToThreadq(runq, start_, flag, p, match_);
|
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
|
||||||
match_[0] = NULL;
|
Decref(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
// If all the threads have died, stop early.
|
// If all the threads have died, stop early.
|
||||||
if (runq->size() == 0) {
|
if (runq->size() == 0) {
|
||||||
if (Debug)
|
if (ExtraDebug)
|
||||||
fprintf(stderr, "dead\n");
|
fprintf(stderr, "dead\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (p == text.end())
|
|
||||||
c = 0;
|
|
||||||
else
|
|
||||||
c = *p & 0xFF;
|
|
||||||
wasword = isword;
|
wasword = isword;
|
||||||
|
|
||||||
// Will run step(runq, nextq, c, ...) on next iteration. See above.
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
|
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
|
||||||
FreeThread(i->second);
|
Decref(i->second);
|
||||||
|
|
||||||
if (matched_) {
|
if (matched_) {
|
||||||
for (int i = 0; i < nsubmatch; i++)
|
for (int i = 0; i < nsubmatch; i++)
|
||||||
submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
|
submatch[i] =
|
||||||
if (Debug)
|
StringPiece(match_[2 * i],
|
||||||
fprintf(stderr, "match (%d,%d)\n",
|
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
|
||||||
static_cast<int>(match_[0] - btext_),
|
if (ExtraDebug)
|
||||||
static_cast<int>(match_[1] - btext_));
|
fprintf(stderr, "match (%td,%td)\n",
|
||||||
|
match_[0] - btext_, match_[1] - btext_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
VLOG(1) << "No matches found";
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Computes whether all successful matches have a common first byte,
|
// Computes whether all successful matches have a common first byte,
|
||||||
// and if so, returns that byte. If not, returns -1.
|
// and if so, returns that byte. If not, returns -1.
|
||||||
int NFA::ComputeFirstByte() {
|
int Prog::ComputeFirstByte() {
|
||||||
if (start_ == 0)
|
int b = -1;
|
||||||
return -1;
|
SparseSet q(size());
|
||||||
|
q.insert(start());
|
||||||
int b = -1; // first byte, not yet computed
|
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
|
||||||
|
|
||||||
typedef SparseSet Workq;
|
|
||||||
Workq q(prog_->size());
|
|
||||||
q.insert(start_);
|
|
||||||
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
|
|
||||||
int id = *it;
|
int id = *it;
|
||||||
Prog::Inst* ip = prog_->inst(id);
|
Prog::Inst* ip = inst(id);
|
||||||
switch (ip->opcode()) {
|
switch (ip->opcode()) {
|
||||||
default:
|
default:
|
||||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
|
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
|
||||||
@ -642,6 +664,9 @@ int NFA::ComputeFirstByte() {
|
|||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
case kInstByteRange:
|
case kInstByteRange:
|
||||||
|
if (!ip->last())
|
||||||
|
q.insert(id+1);
|
||||||
|
|
||||||
// Must match only a single byte
|
// Must match only a single byte
|
||||||
if (ip->lo() != ip->hi())
|
if (ip->lo() != ip->hi())
|
||||||
return -1;
|
return -1;
|
||||||
@ -658,6 +683,9 @@ int NFA::ComputeFirstByte() {
|
|||||||
case kInstNop:
|
case kInstNop:
|
||||||
case kInstCapture:
|
case kInstCapture:
|
||||||
case kInstEmptyWidth:
|
case kInstEmptyWidth:
|
||||||
|
if (!ip->last())
|
||||||
|
q.insert(id+1);
|
||||||
|
|
||||||
// Continue on.
|
// Continue on.
|
||||||
// Ignore ip->empty() flags for kInstEmptyWidth
|
// Ignore ip->empty() flags for kInstEmptyWidth
|
||||||
// in order to be as conservative as possible
|
// in order to be as conservative as possible
|
||||||
@ -666,13 +694,9 @@ int NFA::ComputeFirstByte() {
|
|||||||
q.insert(ip->out());
|
q.insert(ip->out());
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstAlt:
|
|
||||||
case kInstAltMatch:
|
case kInstAltMatch:
|
||||||
// Explore alternatives.
|
DCHECK(!ip->last());
|
||||||
if (ip->out())
|
q.insert(id+1);
|
||||||
q.insert(ip->out());
|
|
||||||
if (ip->out1())
|
|
||||||
q.insert(ip->out1());
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kInstFail:
|
case kInstFail:
|
||||||
@ -686,7 +710,7 @@ bool
|
|||||||
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||||
Anchor anchor, MatchKind kind,
|
Anchor anchor, MatchKind kind,
|
||||||
StringPiece* match, int nmatch) {
|
StringPiece* match, int nmatch) {
|
||||||
if (NFA::Debug)
|
if (ExtraDebug)
|
||||||
Dump();
|
Dump();
|
||||||
|
|
||||||
NFA nfa(this);
|
NFA nfa(this);
|
||||||
@ -705,5 +729,63 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace re2
|
// For each instruction i in the program reachable from the start, compute the
|
||||||
|
// number of instructions reachable from i by following only empty transitions
|
||||||
|
// and record that count as fanout[i].
|
||||||
|
//
|
||||||
|
// fanout holds the results and is also the work queue for the outer iteration.
|
||||||
|
// reachable holds the reached nodes for the inner iteration.
|
||||||
|
void Prog::Fanout(SparseArray<int>* fanout) {
|
||||||
|
DCHECK_EQ(fanout->max_size(), size());
|
||||||
|
SparseSet reachable(size());
|
||||||
|
fanout->clear();
|
||||||
|
fanout->set_new(start(), 0);
|
||||||
|
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
|
||||||
|
int* count = &i->second;
|
||||||
|
reachable.clear();
|
||||||
|
reachable.insert(i->index());
|
||||||
|
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
|
||||||
|
int id = *j;
|
||||||
|
Prog::Inst* ip = inst(id);
|
||||||
|
switch (ip->opcode()) {
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstByteRange:
|
||||||
|
if (!ip->last())
|
||||||
|
reachable.insert(id+1);
|
||||||
|
|
||||||
|
(*count)++;
|
||||||
|
if (!fanout->has_index(ip->out())) {
|
||||||
|
fanout->set_new(ip->out(), 0);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstAltMatch:
|
||||||
|
DCHECK(!ip->last());
|
||||||
|
reachable.insert(id+1);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstCapture:
|
||||||
|
case kInstEmptyWidth:
|
||||||
|
case kInstNop:
|
||||||
|
if (!ip->last())
|
||||||
|
reachable.insert(id+1);
|
||||||
|
|
||||||
|
reachable.insert(ip->out());
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstMatch:
|
||||||
|
if (!ip->last())
|
||||||
|
reachable.insert(id+1);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstFail:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace re2
|
||||||
|
@ -50,17 +50,29 @@
|
|||||||
// See also Anne Brüggemann-Klein and Derick Wood,
|
// See also Anne Brüggemann-Klein and Derick Wood,
|
||||||
// "One-unambiguous regular languages", Information and Computation 142(2).
|
// "One-unambiguous regular languages", Information and Computation 142(2).
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <algorithm>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
#include "util/arena.h"
|
#include "util/logging.h"
|
||||||
#include "util/sparse_set.h"
|
#include "util/sparse_set.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/prog.h"
|
#include "re2/prog.h"
|
||||||
#include "re2/stringpiece.h"
|
#include "re2/stringpiece.h"
|
||||||
|
|
||||||
|
// Silence "zero-sized array in struct/union" warning for OneState::action.
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#pragma warning(disable: 4200)
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
static const int Debug = 0;
|
static const bool ExtraDebug = false;
|
||||||
|
|
||||||
// The key insight behind this implementation is that the
|
// The key insight behind this implementation is that the
|
||||||
// non-determinism in an NFA for a one-pass regular expression
|
// non-determinism in an NFA for a one-pass regular expression
|
||||||
@ -126,19 +138,16 @@ static const int Debug = 0;
|
|||||||
// whether a set of conditions required to finish a match at that
|
// whether a set of conditions required to finish a match at that
|
||||||
// point in the input rather than process the next byte.
|
// point in the input rather than process the next byte.
|
||||||
|
|
||||||
// A state in the one-pass NFA (aka DFA) - just an array of actions.
|
|
||||||
struct OneState;
|
|
||||||
|
|
||||||
// A state in the one-pass NFA - just an array of actions indexed
|
// A state in the one-pass NFA - just an array of actions indexed
|
||||||
// by the bytemap_[] of the next input byte. (The bytemap
|
// by the bytemap_[] of the next input byte. (The bytemap
|
||||||
// maps next input bytes into equivalence classes, to reduce
|
// maps next input bytes into equivalence classes, to reduce
|
||||||
// the memory footprint.)
|
// the memory footprint.)
|
||||||
struct OneState {
|
struct OneState {
|
||||||
uint32 matchcond; // conditions to match right now.
|
uint32_t matchcond; // conditions to match right now.
|
||||||
uint32 action[1];
|
uint32_t action[];
|
||||||
};
|
};
|
||||||
|
|
||||||
// The uint32 conditions in the action are a combination of
|
// The uint32_t conditions in the action are a combination of
|
||||||
// condition and capture bits and the next state. The bottom 16 bits
|
// condition and capture bits and the next state. The bottom 16 bits
|
||||||
// are the condition and capture bits, and the top 16 are the index of
|
// are the condition and capture bits, and the top 16 are the index of
|
||||||
// the next state.
|
// the next state.
|
||||||
@ -164,23 +173,23 @@ static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
|||||||
static const int kCapShift = kRealCapShift - 2;
|
static const int kCapShift = kRealCapShift - 2;
|
||||||
static const int kMaxCap = kRealMaxCap + 2;
|
static const int kMaxCap = kRealMaxCap + 2;
|
||||||
|
|
||||||
static const uint32 kMatchWins = 1 << kEmptyShift;
|
static const uint32_t kMatchWins = 1 << kEmptyShift;
|
||||||
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
||||||
|
|
||||||
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
||||||
|
|
||||||
// Check, at compile time, that prog.h agrees with math above.
|
// Check, at compile time, that prog.h agrees with math above.
|
||||||
// This function is never called.
|
// This function is never called.
|
||||||
void OnePass_Checks() {
|
void OnePass_Checks() {
|
||||||
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
||||||
kEmptyShift_disagrees_with_kEmptyAllFlags);
|
"kEmptyShift disagrees with kEmptyAllFlags");
|
||||||
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
||||||
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
|
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
|
||||||
kMaxCap_disagrees_with_kMaxOnePassCapture);
|
"kMaxCap disagrees with kMaxOnePassCapture");
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
|
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
|
||||||
uint32 satisfied = Prog::EmptyFlags(context, p);
|
uint32_t satisfied = Prog::EmptyFlags(context, p);
|
||||||
if (cond & kEmptyAllFlags & ~satisfied)
|
if (cond & kEmptyAllFlags & ~satisfied)
|
||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
@ -188,20 +197,17 @@ static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
|
|||||||
|
|
||||||
// Apply the capture bits in cond, saving p to the appropriate
|
// Apply the capture bits in cond, saving p to the appropriate
|
||||||
// locations in cap[].
|
// locations in cap[].
|
||||||
static void ApplyCaptures(uint32 cond, const char* p,
|
static void ApplyCaptures(uint32_t cond, const char* p,
|
||||||
const char** cap, int ncap) {
|
const char** cap, int ncap) {
|
||||||
for (int i = 2; i < ncap; i++)
|
for (int i = 2; i < ncap; i++)
|
||||||
if (cond & (1 << kCapShift << i))
|
if (cond & (1 << kCapShift << i))
|
||||||
cap[i] = p;
|
cap[i] = p;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute a node pointer.
|
// Computes the OneState* for the given nodeindex.
|
||||||
// Basically (OneState*)(nodes + statesize*nodeindex)
|
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
|
||||||
// but the version with the C++ casts overflows 80 characters (and is ugly).
|
|
||||||
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
|
|
||||||
int nodeindex) {
|
int nodeindex) {
|
||||||
return reinterpret_cast<OneState*>(
|
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
|
||||||
const_cast<uint8*>(nodes + statesize*nodeindex));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Prog::SearchOnePass(const StringPiece& text,
|
bool Prog::SearchOnePass(const StringPiece& text,
|
||||||
@ -237,30 +243,27 @@ bool Prog::SearchOnePass(const StringPiece& text,
|
|||||||
if (anchor_end())
|
if (anchor_end())
|
||||||
kind = kFullMatch;
|
kind = kFullMatch;
|
||||||
|
|
||||||
// State and act are marked volatile to
|
uint8_t* nodes = onepass_nodes_;
|
||||||
// keep the compiler from re-ordering the
|
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
||||||
// memory accesses walking over the NFA.
|
// start() is always mapped to the zeroth OneState.
|
||||||
// This is worth about 5%.
|
OneState* state = IndexToNode(nodes, statesize, 0);
|
||||||
volatile OneState* state = onepass_start_;
|
uint8_t* bytemap = bytemap_;
|
||||||
volatile uint8* nodes = onepass_nodes_;
|
|
||||||
volatile uint32 statesize = onepass_statesize_;
|
|
||||||
uint8* bytemap = bytemap_;
|
|
||||||
const char* bp = text.begin();
|
const char* bp = text.begin();
|
||||||
const char* ep = text.end();
|
const char* ep = text.end();
|
||||||
const char* p;
|
const char* p;
|
||||||
bool matched = false;
|
bool matched = false;
|
||||||
matchcap[0] = bp;
|
matchcap[0] = bp;
|
||||||
cap[0] = bp;
|
cap[0] = bp;
|
||||||
uint32 nextmatchcond = state->matchcond;
|
uint32_t nextmatchcond = state->matchcond;
|
||||||
for (p = bp; p < ep; p++) {
|
for (p = bp; p < ep; p++) {
|
||||||
int c = bytemap[*p & 0xFF];
|
int c = bytemap[*p & 0xFF];
|
||||||
uint32 matchcond = nextmatchcond;
|
uint32_t matchcond = nextmatchcond;
|
||||||
uint32 cond = state->action[c];
|
uint32_t cond = state->action[c];
|
||||||
|
|
||||||
// Determine whether we can reach act->next.
|
// Determine whether we can reach act->next.
|
||||||
// If so, advance state and nextmatchcond.
|
// If so, advance state and nextmatchcond.
|
||||||
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
||||||
uint32 nextindex = cond >> kIndexShift;
|
uint32_t nextindex = cond >> kIndexShift;
|
||||||
state = IndexToNode(nodes, statesize, nextindex);
|
state = IndexToNode(nodes, statesize, nextindex);
|
||||||
nextmatchcond = state->matchcond;
|
nextmatchcond = state->matchcond;
|
||||||
} else {
|
} else {
|
||||||
@ -319,7 +322,7 @@ bool Prog::SearchOnePass(const StringPiece& text,
|
|||||||
|
|
||||||
// Look for match at end of input.
|
// Look for match at end of input.
|
||||||
{
|
{
|
||||||
uint32 matchcond = state->matchcond;
|
uint32_t matchcond = state->matchcond;
|
||||||
if (matchcond != kImpossible &&
|
if (matchcond != kImpossible &&
|
||||||
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
||||||
if (nmatch > 1 && (matchcond & kCapMask))
|
if (nmatch > 1 && (matchcond & kCapMask))
|
||||||
@ -335,7 +338,9 @@ done:
|
|||||||
if (!matched)
|
if (!matched)
|
||||||
return false;
|
return false;
|
||||||
for (int i = 0; i < nmatch; i++)
|
for (int i = 0; i < nmatch; i++)
|
||||||
match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
|
match[i] =
|
||||||
|
StringPiece(matchcap[2 * i],
|
||||||
|
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -357,7 +362,7 @@ static bool AddQ(Instq *q, int id) {
|
|||||||
|
|
||||||
struct InstCond {
|
struct InstCond {
|
||||||
int id;
|
int id;
|
||||||
uint32 cond;
|
uint32_t cond;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Returns whether this is a one-pass program; that is,
|
// Returns whether this is a one-pass program; that is,
|
||||||
@ -377,7 +382,7 @@ struct InstCond {
|
|||||||
// Constructs and saves corresponding one-pass NFA on success.
|
// Constructs and saves corresponding one-pass NFA on success.
|
||||||
bool Prog::IsOnePass() {
|
bool Prog::IsOnePass() {
|
||||||
if (did_onepass_)
|
if (did_onepass_)
|
||||||
return onepass_start_ != NULL;
|
return onepass_nodes_ != NULL;
|
||||||
did_onepass_ = true;
|
did_onepass_ = true;
|
||||||
|
|
||||||
if (start() == 0) // no match
|
if (start() == 0) // no match
|
||||||
@ -387,32 +392,37 @@ bool Prog::IsOnePass() {
|
|||||||
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
||||||
// Limit max node count to 65000 as a conservative estimate to
|
// Limit max node count to 65000 as a conservative estimate to
|
||||||
// avoid overflowing 16-bit node index in encoding.
|
// avoid overflowing 16-bit node index in encoding.
|
||||||
int maxnodes = 2 + byte_inst_count_;
|
int maxnodes = 2 + inst_count(kInstByteRange);
|
||||||
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
|
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
|
||||||
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
// Flood the graph starting at the start state, and check
|
// Flood the graph starting at the start state, and check
|
||||||
// that in each reachable state, each possible byte leads
|
// that in each reachable state, each possible byte leads
|
||||||
// to a unique next state.
|
// to a unique next state.
|
||||||
int size = this->size();
|
int stacksize = inst_count(kInstCapture) +
|
||||||
InstCond *stack = new InstCond[size];
|
inst_count(kInstEmptyWidth) +
|
||||||
|
inst_count(kInstNop) + 1; // + 1 for start inst
|
||||||
|
InstCond* stack = new InstCond[stacksize];
|
||||||
|
|
||||||
|
int size = this->size();
|
||||||
int* nodebyid = new int[size]; // indexed by ip
|
int* nodebyid = new int[size]; // indexed by ip
|
||||||
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
||||||
|
|
||||||
uint8* nodes = new uint8[maxnodes*statesize];
|
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
|
||||||
uint8* nodep = nodes;
|
// unnecessarily optimistic: why allocate a large amount of memory
|
||||||
|
// upfront for a large program when it is unlikely to be one-pass?
|
||||||
|
std::vector<uint8_t> nodes;
|
||||||
|
|
||||||
Instq tovisit(size), workq(size);
|
Instq tovisit(size), workq(size);
|
||||||
AddQ(&tovisit, start());
|
AddQ(&tovisit, start());
|
||||||
nodebyid[start()] = 0;
|
nodebyid[start()] = 0;
|
||||||
nodep += statesize;
|
|
||||||
int nalloc = 1;
|
int nalloc = 1;
|
||||||
|
nodes.insert(nodes.end(), statesize, 0);
|
||||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||||
int id = *it;
|
int id = *it;
|
||||||
int nodeindex = nodebyid[id];
|
int nodeindex = nodebyid[id];
|
||||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||||
|
|
||||||
// Flood graph using manual stack, filling in actions as found.
|
// Flood graph using manual stack, filling in actions as found.
|
||||||
// Default is none.
|
// Default is none.
|
||||||
@ -427,93 +437,108 @@ bool Prog::IsOnePass() {
|
|||||||
stack[nstack++].cond = 0;
|
stack[nstack++].cond = 0;
|
||||||
while (nstack > 0) {
|
while (nstack > 0) {
|
||||||
int id = stack[--nstack].id;
|
int id = stack[--nstack].id;
|
||||||
|
uint32_t cond = stack[nstack].cond;
|
||||||
|
|
||||||
|
Loop:
|
||||||
Prog::Inst* ip = inst(id);
|
Prog::Inst* ip = inst(id);
|
||||||
uint32 cond = stack[nstack].cond;
|
|
||||||
switch (ip->opcode()) {
|
switch (ip->opcode()) {
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||||
|
break;
|
||||||
|
|
||||||
case kInstAltMatch:
|
case kInstAltMatch:
|
||||||
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
||||||
// Should implement it in this engine, but it's subtle.
|
// Should implement it in this engine, but it's subtle.
|
||||||
// Fall through.
|
DCHECK(!ip->last());
|
||||||
case kInstAlt:
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
// If already on work queue, (1) is violated: bail out.
|
||||||
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
|
if (!AddQ(&workq, id+1))
|
||||||
goto fail;
|
goto fail;
|
||||||
stack[nstack].id = ip->out1();
|
id = id+1;
|
||||||
stack[nstack++].cond = cond;
|
goto Loop;
|
||||||
stack[nstack].id = ip->out();
|
|
||||||
stack[nstack++].cond = cond;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstByteRange: {
|
case kInstByteRange: {
|
||||||
int nextindex = nodebyid[ip->out()];
|
int nextindex = nodebyid[ip->out()];
|
||||||
if (nextindex == -1) {
|
if (nextindex == -1) {
|
||||||
if (nalloc >= maxnodes) {
|
if (nalloc >= maxnodes) {
|
||||||
if (Debug)
|
if (ExtraDebug)
|
||||||
LOG(ERROR)
|
LOG(ERROR) << StringPrintf(
|
||||||
<< StringPrintf("Not OnePass: hit node limit %d > %d",
|
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
|
||||||
nalloc, maxnodes);
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
nextindex = nalloc;
|
nextindex = nalloc;
|
||||||
nodep += statesize;
|
|
||||||
nodebyid[ip->out()] = nextindex;
|
|
||||||
nalloc++;
|
|
||||||
AddQ(&tovisit, ip->out());
|
AddQ(&tovisit, ip->out());
|
||||||
|
nodebyid[ip->out()] = nalloc;
|
||||||
|
nalloc++;
|
||||||
|
nodes.insert(nodes.end(), statesize, 0);
|
||||||
|
// Update node because it might have been invalidated.
|
||||||
|
node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||||
}
|
}
|
||||||
if (matched)
|
|
||||||
cond |= kMatchWins;
|
|
||||||
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
||||||
int b = bytemap_[c];
|
int b = bytemap_[c];
|
||||||
c = unbytemap_[b]; // last c in byte class
|
// Skip any bytes immediately after c that are also in b.
|
||||||
uint32 act = node->action[b];
|
while (c < 256-1 && bytemap_[c+1] == b)
|
||||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
c++;
|
||||||
|
uint32_t act = node->action[b];
|
||||||
|
uint32_t newact = (nextindex << kIndexShift) | cond;
|
||||||
|
if (matched)
|
||||||
|
newact |= kMatchWins;
|
||||||
if ((act & kImpossible) == kImpossible) {
|
if ((act & kImpossible) == kImpossible) {
|
||||||
node->action[b] = newact;
|
node->action[b] = newact;
|
||||||
} else if (act != newact) {
|
} else if (act != newact) {
|
||||||
if (Debug) {
|
if (ExtraDebug)
|
||||||
LOG(ERROR)
|
LOG(ERROR) << StringPrintf(
|
||||||
<< StringPrintf("Not OnePass: conflict on byte "
|
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
||||||
"%#x at state %d",
|
|
||||||
c, *it);
|
|
||||||
}
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (ip->foldcase()) {
|
if (ip->foldcase()) {
|
||||||
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
||||||
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
||||||
for (int c = lo; c <= hi; c++) {
|
for (int c = lo; c <= hi; c++) {
|
||||||
int b = bytemap_[c];
|
int b = bytemap_[c];
|
||||||
c = unbytemap_[b]; // last c in class
|
// Skip any bytes immediately after c that are also in b.
|
||||||
uint32 act = node->action[b];
|
while (c < 256-1 && bytemap_[c+1] == b)
|
||||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
c++;
|
||||||
|
uint32_t act = node->action[b];
|
||||||
|
uint32_t newact = (nextindex << kIndexShift) | cond;
|
||||||
|
if (matched)
|
||||||
|
newact |= kMatchWins;
|
||||||
if ((act & kImpossible) == kImpossible) {
|
if ((act & kImpossible) == kImpossible) {
|
||||||
node->action[b] = newact;
|
node->action[b] = newact;
|
||||||
} else if (act != newact) {
|
} else if (act != newact) {
|
||||||
if (Debug) {
|
if (ExtraDebug)
|
||||||
LOG(ERROR)
|
LOG(ERROR) << StringPrintf(
|
||||||
<< StringPrintf("Not OnePass: conflict on byte "
|
"Not OnePass: conflict on byte %#x at state %d", c, *it);
|
||||||
"%#x at state %d",
|
|
||||||
c, *it);
|
|
||||||
}
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ip->last())
|
||||||
break;
|
break;
|
||||||
|
// If already on work queue, (1) is violated: bail out.
|
||||||
|
if (!AddQ(&workq, id+1))
|
||||||
|
goto fail;
|
||||||
|
id = id+1;
|
||||||
|
goto Loop;
|
||||||
}
|
}
|
||||||
|
|
||||||
case kInstCapture:
|
case kInstCapture:
|
||||||
if (ip->cap() < kMaxCap)
|
|
||||||
cond |= (1 << kCapShift) << ip->cap();
|
|
||||||
goto QueueEmpty;
|
|
||||||
|
|
||||||
case kInstEmptyWidth:
|
case kInstEmptyWidth:
|
||||||
cond |= ip->empty();
|
|
||||||
goto QueueEmpty;
|
|
||||||
|
|
||||||
case kInstNop:
|
case kInstNop:
|
||||||
QueueEmpty:
|
if (!ip->last()) {
|
||||||
|
// If already on work queue, (1) is violated: bail out.
|
||||||
|
if (!AddQ(&workq, id+1))
|
||||||
|
goto fail;
|
||||||
|
stack[nstack].id = id+1;
|
||||||
|
stack[nstack++].cond = cond;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
|
||||||
|
cond |= (1 << kCapShift) << ip->cap();
|
||||||
|
if (ip->opcode() == kInstEmptyWidth)
|
||||||
|
cond |= ip->empty();
|
||||||
|
|
||||||
// kInstCapture and kInstNop always proceed to ip->out().
|
// kInstCapture and kInstNop always proceed to ip->out().
|
||||||
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
||||||
// but as a conservative approximation we assume it always does.
|
// but as a conservative approximation we assume it always does.
|
||||||
@ -522,29 +547,32 @@ bool Prog::IsOnePass() {
|
|||||||
|
|
||||||
// If already on work queue, (1) is violated: bail out.
|
// If already on work queue, (1) is violated: bail out.
|
||||||
if (!AddQ(&workq, ip->out())) {
|
if (!AddQ(&workq, ip->out())) {
|
||||||
if (Debug) {
|
if (ExtraDebug)
|
||||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
|
LOG(ERROR) << StringPrintf(
|
||||||
" %d -> %d\n",
|
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
|
||||||
*it, ip->out());
|
|
||||||
}
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
stack[nstack].id = ip->out();
|
id = ip->out();
|
||||||
stack[nstack++].cond = cond;
|
goto Loop;
|
||||||
break;
|
|
||||||
|
|
||||||
case kInstMatch:
|
case kInstMatch:
|
||||||
if (matched) {
|
if (matched) {
|
||||||
// (3) is violated
|
// (3) is violated
|
||||||
if (Debug) {
|
if (ExtraDebug)
|
||||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
|
LOG(ERROR) << StringPrintf(
|
||||||
" from %d\n", *it);
|
"Not OnePass: multiple matches from %d\n", *it);
|
||||||
}
|
|
||||||
goto fail;
|
goto fail;
|
||||||
}
|
}
|
||||||
matched = true;
|
matched = true;
|
||||||
node->matchcond = cond;
|
node->matchcond = cond;
|
||||||
|
|
||||||
|
if (ip->last())
|
||||||
break;
|
break;
|
||||||
|
// If already on work queue, (1) is violated: bail out.
|
||||||
|
if (!AddQ(&workq, id+1))
|
||||||
|
goto fail;
|
||||||
|
id = id+1;
|
||||||
|
goto Loop;
|
||||||
|
|
||||||
case kInstFail:
|
case kInstFail:
|
||||||
break;
|
break;
|
||||||
@ -552,29 +580,22 @@ bool Prog::IsOnePass() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
||||||
string dump = "prog dump:\n" + Dump() + "node dump\n";
|
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
|
||||||
map<int, int> idmap;
|
LOG(ERROR) << "prog:\n" << Dump();
|
||||||
|
|
||||||
|
std::map<int, int> idmap;
|
||||||
for (int i = 0; i < size; i++)
|
for (int i = 0; i < size; i++)
|
||||||
if (nodebyid[i] != -1)
|
if (nodebyid[i] != -1)
|
||||||
idmap[nodebyid[i]] = i;
|
idmap[nodebyid[i]] = i;
|
||||||
|
|
||||||
StringAppendF(&dump, "byte ranges:\n");
|
string dump;
|
||||||
int i = 0;
|
|
||||||
for (int b = 0; b < bytemap_range_; b++) {
|
|
||||||
int lo = i;
|
|
||||||
while (bytemap_[i] == b)
|
|
||||||
i++;
|
|
||||||
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||||
int id = *it;
|
int id = *it;
|
||||||
int nodeindex = nodebyid[id];
|
int nodeindex = nodebyid[id];
|
||||||
if (nodeindex == -1)
|
if (nodeindex == -1)
|
||||||
continue;
|
continue;
|
||||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
|
||||||
string s;
|
|
||||||
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
||||||
nodeindex, id, node->matchcond);
|
nodeindex, id, node->matchcond);
|
||||||
for (int i = 0; i < bytemap_range_; i++) {
|
for (int i = 0; i < bytemap_range_; i++) {
|
||||||
@ -586,19 +607,12 @@ bool Prog::IsOnePass() {
|
|||||||
idmap[node->action[i] >> kIndexShift]);
|
idmap[node->action[i] >> kIndexShift]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG(ERROR) << dump;
|
LOG(ERROR) << "nodes:\n" << dump;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Overallocated earlier; cut down to actual size.
|
|
||||||
nodep = new uint8[nalloc*statesize];
|
|
||||||
memmove(nodep, nodes, nalloc*statesize);
|
|
||||||
delete[] nodes;
|
|
||||||
nodes = nodep;
|
|
||||||
|
|
||||||
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
|
|
||||||
onepass_nodes_ = nodes;
|
|
||||||
onepass_statesize_ = statesize;
|
|
||||||
dfa_mem_ -= nalloc*statesize;
|
dfa_mem_ -= nalloc*statesize;
|
||||||
|
onepass_nodes_ = new uint8_t[nalloc*statesize];
|
||||||
|
memmove(onepass_nodes_, nodes.data(), nalloc*statesize);
|
||||||
|
|
||||||
delete[] stack;
|
delete[] stack;
|
||||||
delete[] nodebyid;
|
delete[] nodebyid;
|
||||||
@ -607,7 +621,6 @@ bool Prog::IsOnePass() {
|
|||||||
fail:
|
fail:
|
||||||
delete[] stack;
|
delete[] stack;
|
||||||
delete[] nodebyid;
|
delete[] nodebyid;
|
||||||
delete[] nodes;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -2,34 +2,38 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "re2/prefilter.h"
|
#include "re2/prefilter.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/re2.h"
|
#include "re2/re2.h"
|
||||||
#include "re2/unicode_casefold.h"
|
#include "re2/unicode_casefold.h"
|
||||||
#include "re2/walker-inl.h"
|
#include "re2/walker-inl.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
static const int Trace = false;
|
static const bool ExtraDebug = false;
|
||||||
|
|
||||||
typedef set<string>::iterator SSIter;
|
typedef std::set<string>::iterator SSIter;
|
||||||
typedef set<string>::const_iterator ConstSSIter;
|
typedef std::set<string>::const_iterator ConstSSIter;
|
||||||
|
|
||||||
static int alloc_id = 100000; // Used for debugging.
|
|
||||||
// Initializes a Prefilter, allocating subs_ as necessary.
|
// Initializes a Prefilter, allocating subs_ as necessary.
|
||||||
Prefilter::Prefilter(Op op) {
|
Prefilter::Prefilter(Op op) {
|
||||||
op_ = op;
|
op_ = op;
|
||||||
subs_ = NULL;
|
subs_ = NULL;
|
||||||
if (op_ == AND || op_ == OR)
|
if (op_ == AND || op_ == OR)
|
||||||
subs_ = new vector<Prefilter*>;
|
subs_ = new std::vector<Prefilter*>;
|
||||||
|
|
||||||
alloc_id_ = alloc_id++;
|
|
||||||
VLOG(10) << "alloc_id: " << alloc_id_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Destroys a Prefilter.
|
// Destroys a Prefilter.
|
||||||
Prefilter::~Prefilter() {
|
Prefilter::~Prefilter() {
|
||||||
VLOG(10) << "Deleted: " << alloc_id_;
|
|
||||||
if (subs_) {
|
if (subs_) {
|
||||||
for (size_t i = 0; i < subs_->size(); i++)
|
for (size_t i = 0; i < subs_->size(); i++)
|
||||||
delete (*subs_)[i];
|
delete (*subs_)[i];
|
||||||
@ -45,7 +49,7 @@ Prefilter* Prefilter::Simplify() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Nothing left in the AND/OR.
|
// Nothing left in the AND/OR.
|
||||||
if (subs_->size() == 0) {
|
if (subs_->empty()) {
|
||||||
if (op_ == AND)
|
if (op_ == AND)
|
||||||
op_ = ALL; // AND of nothing is true
|
op_ = ALL; // AND of nothing is true
|
||||||
else
|
else
|
||||||
@ -136,7 +140,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
|
|||||||
return AndOr(OR, a, b);
|
return AndOr(OR, a, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void SimplifyStringSet(set<string> *ss) {
|
static void SimplifyStringSet(std::set<string> *ss) {
|
||||||
// Now make sure that the strings aren't redundant. For example, if
|
// Now make sure that the strings aren't redundant. For example, if
|
||||||
// we know "ab" is a required string, then it doesn't help at all to
|
// we know "ab" is a required string, then it doesn't help at all to
|
||||||
// know that "abc" is also a required string, so delete "abc". This
|
// know that "abc" is also a required string, so delete "abc". This
|
||||||
@ -157,7 +161,7 @@ static void SimplifyStringSet(set<string> *ss) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Prefilter* Prefilter::OrStrings(set<string>* ss) {
|
Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
|
||||||
SimplifyStringSet(ss);
|
SimplifyStringSet(ss);
|
||||||
Prefilter* or_prefilter = NULL;
|
Prefilter* or_prefilter = NULL;
|
||||||
if (!ss->empty()) {
|
if (!ss->empty()) {
|
||||||
@ -176,7 +180,7 @@ static Rune ToLowerRune(Rune r) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
|
const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
|
||||||
if (f == NULL || r < static_cast<Rune>(f->lo))
|
if (f == NULL || r < f->lo)
|
||||||
return r;
|
return r;
|
||||||
return ApplyFold(f, r);
|
return ApplyFold(f, r);
|
||||||
}
|
}
|
||||||
@ -222,14 +226,14 @@ class Prefilter::Info {
|
|||||||
// Caller takes ownership of the Prefilter.
|
// Caller takes ownership of the Prefilter.
|
||||||
Prefilter* TakeMatch();
|
Prefilter* TakeMatch();
|
||||||
|
|
||||||
set<string>& exact() { return exact_; }
|
std::set<string>& exact() { return exact_; }
|
||||||
|
|
||||||
bool is_exact() const { return is_exact_; }
|
bool is_exact() const { return is_exact_; }
|
||||||
|
|
||||||
class Walker;
|
class Walker;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
set<string> exact_;
|
std::set<string> exact_;
|
||||||
|
|
||||||
// When is_exact_ is true, the strings that match
|
// When is_exact_ is true, the strings that match
|
||||||
// are placed in exact_. When it is no longer an exact
|
// are placed in exact_. When it is no longer an exact
|
||||||
@ -268,7 +272,9 @@ string Prefilter::Info::ToString() {
|
|||||||
if (is_exact_) {
|
if (is_exact_) {
|
||||||
int n = 0;
|
int n = 0;
|
||||||
string s;
|
string s;
|
||||||
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
|
for (std::set<string>::iterator i = exact_.begin();
|
||||||
|
i != exact_.end();
|
||||||
|
++i) {
|
||||||
if (n++ > 0)
|
if (n++ > 0)
|
||||||
s += ",";
|
s += ",";
|
||||||
s += *i;
|
s += *i;
|
||||||
@ -283,16 +289,17 @@ string Prefilter::Info::ToString() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Add the strings from src to dst.
|
// Add the strings from src to dst.
|
||||||
static void CopyIn(const set<string>& src, set<string>* dst) {
|
static void CopyIn(const std::set<string>& src,
|
||||||
|
std::set<string>* dst) {
|
||||||
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
|
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
|
||||||
dst->insert(*i);
|
dst->insert(*i);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add the cross-product of a and b to dst.
|
// Add the cross-product of a and b to dst.
|
||||||
// (For each string i in a and j in b, add i+j.)
|
// (For each string i in a and j in b, add i+j.)
|
||||||
static void CrossProduct(const set<string>& a,
|
static void CrossProduct(const std::set<string>& a,
|
||||||
const set<string>& b,
|
const std::set<string>& b,
|
||||||
set<string>* dst) {
|
std::set<string>* dst) {
|
||||||
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
|
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
|
||||||
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
|
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
|
||||||
dst->insert(*i + *j);
|
dst->insert(*i + *j);
|
||||||
@ -446,10 +453,10 @@ Prefilter::Info* Prefilter::Info::EmptyString() {
|
|||||||
typedef CharClass::iterator CCIter;
|
typedef CharClass::iterator CCIter;
|
||||||
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
||||||
bool latin1) {
|
bool latin1) {
|
||||||
if (Trace) {
|
if (ExtraDebug) {
|
||||||
VLOG(0) << "CharClassInfo:";
|
LOG(ERROR) << "CharClassInfo:";
|
||||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||||
VLOG(0) << " " << i->lo << "-" << i->hi;
|
LOG(ERROR) << " " << i->lo << "-" << i->hi;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the class is too large, it's okay to overestimate.
|
// If the class is too large, it's okay to overestimate.
|
||||||
@ -469,9 +476,8 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
|||||||
|
|
||||||
a->is_exact_ = true;
|
a->is_exact_ = true;
|
||||||
|
|
||||||
if (Trace) {
|
if (ExtraDebug)
|
||||||
VLOG(0) << " = " << a->ToString();
|
LOG(ERROR) << " = " << a->ToString();
|
||||||
}
|
|
||||||
|
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
@ -492,15 +498,16 @@ class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
|
|||||||
bool latin1() { return latin1_; }
|
bool latin1() { return latin1_; }
|
||||||
private:
|
private:
|
||||||
bool latin1_;
|
bool latin1_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Walker);
|
|
||||||
|
Walker(const Walker&) = delete;
|
||||||
|
Walker& operator=(const Walker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
|
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
|
||||||
if (Trace) {
|
if (ExtraDebug)
|
||||||
LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
|
LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
|
||||||
}
|
|
||||||
|
|
||||||
bool latin1 = re->parse_flags() & Regexp::Latin1;
|
bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
|
||||||
Prefilter::Info::Walker w(latin1);
|
Prefilter::Info::Walker w(latin1);
|
||||||
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
|
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
|
||||||
|
|
||||||
@ -600,7 +607,6 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
|||||||
info = child_args[0];
|
info = child_args[0];
|
||||||
for (int i = 1; i < nchild_args; i++)
|
for (int i = 1; i < nchild_args; i++)
|
||||||
info = Alt(info, child_args[i]);
|
info = Alt(info, child_args[i]);
|
||||||
VLOG(10) << "Alt: " << info->ToString();
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kRegexpStar:
|
case kRegexpStar:
|
||||||
@ -630,10 +636,9 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Trace) {
|
if (ExtraDebug)
|
||||||
VLOG(0) << "BuildInfo " << re->ToString()
|
LOG(ERROR) << "BuildInfo " << re->ToString()
|
||||||
<< ": " << (info ? info->ToString() : "");
|
<< ": " << (info ? info->ToString() : "");
|
||||||
}
|
|
||||||
|
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
|
@ -2,14 +2,19 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_PREFILTER_H_
|
||||||
|
#define RE2_PREFILTER_H_
|
||||||
|
|
||||||
// Prefilter is the class used to extract string guards from regexps.
|
// Prefilter is the class used to extract string guards from regexps.
|
||||||
// Rather than using Prefilter class directly, use FilteredRE2.
|
// Rather than using Prefilter class directly, use FilteredRE2.
|
||||||
// See filtered_re2.h
|
// See filtered_re2.h
|
||||||
|
|
||||||
#ifndef RE2_PREFILTER_H_
|
#include <set>
|
||||||
#define RE2_PREFILTER_H_
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
@ -37,14 +42,14 @@ class Prefilter {
|
|||||||
int unique_id() const { return unique_id_; }
|
int unique_id() const { return unique_id_; }
|
||||||
|
|
||||||
// The children of the Prefilter node.
|
// The children of the Prefilter node.
|
||||||
vector<Prefilter*>* subs() {
|
std::vector<Prefilter*>* subs() {
|
||||||
CHECK(op_ == AND || op_ == OR);
|
DCHECK(op_ == AND || op_ == OR);
|
||||||
return subs_;
|
return subs_;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the children vector. Prefilter takes ownership of subs and
|
// Set the children vector. Prefilter takes ownership of subs and
|
||||||
// subs_ will be deleted when Prefilter is deleted.
|
// subs_ will be deleted when Prefilter is deleted.
|
||||||
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
|
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
|
||||||
|
|
||||||
// Given a RE2, return a Prefilter. The caller takes ownership of
|
// Given a RE2, return a Prefilter. The caller takes ownership of
|
||||||
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
||||||
@ -72,7 +77,7 @@ class Prefilter {
|
|||||||
|
|
||||||
static Prefilter* FromString(const string& str);
|
static Prefilter* FromString(const string& str);
|
||||||
|
|
||||||
static Prefilter* OrStrings(set<string>* ss);
|
static Prefilter* OrStrings(std::set<string>* ss);
|
||||||
|
|
||||||
static Info* BuildInfo(Regexp* re);
|
static Info* BuildInfo(Regexp* re);
|
||||||
|
|
||||||
@ -82,7 +87,7 @@ class Prefilter {
|
|||||||
Op op_;
|
Op op_;
|
||||||
|
|
||||||
// Sub-matches for AND or OR Prefilter.
|
// Sub-matches for AND or OR Prefilter.
|
||||||
vector<Prefilter*>* subs_;
|
std::vector<Prefilter*>* subs_;
|
||||||
|
|
||||||
// Actual string to match in leaf node.
|
// Actual string to match in leaf node.
|
||||||
string atom_;
|
string atom_;
|
||||||
@ -94,10 +99,8 @@ class Prefilter {
|
|||||||
// and -1 for duplicate nodes.
|
// and -1 for duplicate nodes.
|
||||||
int unique_id_;
|
int unique_id_;
|
||||||
|
|
||||||
// Used for debugging, helps in tracking memory leaks.
|
Prefilter(const Prefilter&) = delete;
|
||||||
int alloc_id_;
|
Prefilter& operator=(const Prefilter&) = delete;
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Prefilter);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
@ -2,20 +2,35 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/flags.h"
|
|
||||||
#include "re2/prefilter.h"
|
|
||||||
#include "re2/prefilter_tree.h"
|
#include "re2/prefilter_tree.h"
|
||||||
#include "re2/re2.h"
|
|
||||||
|
|
||||||
DEFINE_int32(filtered_re2_min_atom_len,
|
#include <stddef.h>
|
||||||
3,
|
#include <algorithm>
|
||||||
"Strings less than this length are not stored as atoms");
|
#include <map>
|
||||||
|
#include <memory>
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
#include "re2/prefilter.h"
|
||||||
|
#include "re2/re2.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
|
static const bool ExtraDebug = false;
|
||||||
|
|
||||||
PrefilterTree::PrefilterTree()
|
PrefilterTree::PrefilterTree()
|
||||||
: compiled_(false) {
|
: compiled_(false),
|
||||||
|
min_atom_len_(3) {
|
||||||
|
}
|
||||||
|
|
||||||
|
PrefilterTree::PrefilterTree(int min_atom_len)
|
||||||
|
: compiled_(false),
|
||||||
|
min_atom_len_(min_atom_len) {
|
||||||
}
|
}
|
||||||
|
|
||||||
PrefilterTree::~PrefilterTree() {
|
PrefilterTree::~PrefilterTree() {
|
||||||
@ -26,62 +41,22 @@ PrefilterTree::~PrefilterTree() {
|
|||||||
delete entries_[i].parents;
|
delete entries_[i].parents;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Functions used for adding and Compiling prefilters to the
|
void PrefilterTree::Add(Prefilter* prefilter) {
|
||||||
// PrefilterTree.
|
|
||||||
static bool KeepPart(Prefilter* prefilter, int level) {
|
|
||||||
if (prefilter == NULL)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
switch (prefilter->op()) {
|
|
||||||
default:
|
|
||||||
LOG(DFATAL) << "Unexpected op in KeepPart: "
|
|
||||||
<< prefilter->op();
|
|
||||||
return false;
|
|
||||||
|
|
||||||
case Prefilter::ALL:
|
|
||||||
return false;
|
|
||||||
|
|
||||||
case Prefilter::ATOM:
|
|
||||||
return prefilter->atom().size() >=
|
|
||||||
static_cast<size_t>(FLAGS_filtered_re2_min_atom_len);
|
|
||||||
|
|
||||||
case Prefilter::AND: {
|
|
||||||
int j = 0;
|
|
||||||
vector<Prefilter*>* subs = prefilter->subs();
|
|
||||||
for (size_t i = 0; i < subs->size(); i++)
|
|
||||||
if (KeepPart((*subs)[i], level + 1))
|
|
||||||
(*subs)[j++] = (*subs)[i];
|
|
||||||
else
|
|
||||||
delete (*subs)[i];
|
|
||||||
|
|
||||||
subs->resize(j);
|
|
||||||
return j > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
case Prefilter::OR:
|
|
||||||
for (size_t i = 0; i < prefilter->subs()->size(); i++)
|
|
||||||
if (!KeepPart((*prefilter->subs())[i], level + 1))
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void PrefilterTree::Add(Prefilter *f) {
|
|
||||||
if (compiled_) {
|
if (compiled_) {
|
||||||
LOG(DFATAL) << "Add after Compile.";
|
LOG(DFATAL) << "Add called after Compile.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (f != NULL && !KeepPart(f, 0)) {
|
if (prefilter != NULL && !KeepNode(prefilter)) {
|
||||||
delete f;
|
delete prefilter;
|
||||||
f = NULL;
|
prefilter = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
prefilter_vec_.push_back(f);
|
prefilter_vec_.push_back(prefilter);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PrefilterTree::Compile(vector<string>* atom_vec) {
|
void PrefilterTree::Compile(std::vector<string>* atom_vec) {
|
||||||
if (compiled_) {
|
if (compiled_) {
|
||||||
LOG(DFATAL) << "Compile after Compile.";
|
LOG(DFATAL) << "Compile called already.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -93,7 +68,9 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
|
|||||||
|
|
||||||
compiled_ = true;
|
compiled_ = true;
|
||||||
|
|
||||||
AssignUniqueIds(atom_vec);
|
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
|
||||||
|
NodeMap nodes;
|
||||||
|
AssignUniqueIds(&nodes, atom_vec);
|
||||||
|
|
||||||
// Identify nodes that are too common among prefilters and are
|
// Identify nodes that are too common among prefilters and are
|
||||||
// triggering too many parents. Then get rid of them if possible.
|
// triggering too many parents. Then get rid of them if possible.
|
||||||
@ -109,9 +86,11 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
|
|||||||
// this trigger. TODO(vsri): Adjust the threshold appropriately,
|
// this trigger. TODO(vsri): Adjust the threshold appropriately,
|
||||||
// make it a function of total number of nodes?
|
// make it a function of total number of nodes?
|
||||||
bool have_other_guard = true;
|
bool have_other_guard = true;
|
||||||
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
for (StdIntMap::iterator it = parents->begin();
|
||||||
|
it != parents->end(); ++it) {
|
||||||
have_other_guard = have_other_guard &&
|
have_other_guard = have_other_guard &&
|
||||||
(entries_[it->first].propagate_up_at_count > 1);
|
(entries_[it->first].propagate_up_at_count > 1);
|
||||||
|
}
|
||||||
|
|
||||||
if (have_other_guard) {
|
if (have_other_guard) {
|
||||||
for (StdIntMap::iterator it = parents->begin();
|
for (StdIntMap::iterator it = parents->begin();
|
||||||
@ -123,50 +102,82 @@ void PrefilterTree::Compile(vector<string>* atom_vec) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PrintDebugInfo();
|
if (ExtraDebug)
|
||||||
|
PrintDebugInfo(&nodes);
|
||||||
}
|
}
|
||||||
|
|
||||||
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
|
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
|
||||||
string node_string = NodeString(node);
|
string node_string = NodeString(node);
|
||||||
map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
|
std::map<string, Prefilter*>::iterator iter = nodes->find(node_string);
|
||||||
if (iter == node_map_.end())
|
if (iter == nodes->end())
|
||||||
return NULL;
|
return NULL;
|
||||||
return (*iter).second;
|
return (*iter).second;
|
||||||
}
|
}
|
||||||
|
|
||||||
static string Itoa(int n) {
|
|
||||||
char buf[100];
|
|
||||||
snprintf(buf, sizeof buf, "%d", n);
|
|
||||||
return string(buf);
|
|
||||||
}
|
|
||||||
|
|
||||||
string PrefilterTree::NodeString(Prefilter* node) const {
|
string PrefilterTree::NodeString(Prefilter* node) const {
|
||||||
// Adding the operation disambiguates AND/OR/atom nodes.
|
// Adding the operation disambiguates AND/OR/atom nodes.
|
||||||
string s = Itoa(node->op()) + ":";
|
string s = StringPrintf("%d", node->op()) + ":";
|
||||||
if (node->op() == Prefilter::ATOM) {
|
if (node->op() == Prefilter::ATOM) {
|
||||||
s += node->atom();
|
s += node->atom();
|
||||||
} else {
|
} else {
|
||||||
for (size_t i = 0; i < node->subs()->size(); i++) {
|
for (size_t i = 0; i < node->subs()->size(); i++) {
|
||||||
if (i > 0)
|
if (i > 0)
|
||||||
s += ',';
|
s += ',';
|
||||||
s += Itoa((*node->subs())[i]->unique_id());
|
s += StringPrintf("%d", (*node->subs())[i]->unique_id());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
bool PrefilterTree::KeepNode(Prefilter* node) const {
|
||||||
|
if (node == NULL)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
switch (node->op()) {
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
|
||||||
|
return false;
|
||||||
|
|
||||||
|
case Prefilter::ALL:
|
||||||
|
return false;
|
||||||
|
|
||||||
|
case Prefilter::ATOM:
|
||||||
|
return node->atom().size() >= static_cast<size_t>(min_atom_len_);
|
||||||
|
|
||||||
|
case Prefilter::AND: {
|
||||||
|
int j = 0;
|
||||||
|
std::vector<Prefilter*>* subs = node->subs();
|
||||||
|
for (size_t i = 0; i < subs->size(); i++)
|
||||||
|
if (KeepNode((*subs)[i]))
|
||||||
|
(*subs)[j++] = (*subs)[i];
|
||||||
|
else
|
||||||
|
delete (*subs)[i];
|
||||||
|
|
||||||
|
subs->resize(j);
|
||||||
|
return j > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
case Prefilter::OR:
|
||||||
|
for (size_t i = 0; i < node->subs()->size(); i++)
|
||||||
|
if (!KeepNode((*node->subs())[i]))
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
|
||||||
|
std::vector<string>* atom_vec) {
|
||||||
atom_vec->clear();
|
atom_vec->clear();
|
||||||
|
|
||||||
// Build vector of all filter nodes, sorted topologically
|
// Build vector of all filter nodes, sorted topologically
|
||||||
// from top to bottom in v.
|
// from top to bottom in v.
|
||||||
vector<Prefilter*> v;
|
std::vector<Prefilter*> v;
|
||||||
|
|
||||||
// Add the top level nodes of each regexp prefilter.
|
// Add the top level nodes of each regexp prefilter.
|
||||||
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
|
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
|
||||||
Prefilter* f = prefilter_vec_[i];
|
Prefilter* f = prefilter_vec_[i];
|
||||||
if (f == NULL)
|
if (f == NULL)
|
||||||
unfiltered_.push_back(i);
|
unfiltered_.push_back(static_cast<int>(i));
|
||||||
|
|
||||||
// We push NULL also on to v, so that we maintain the
|
// We push NULL also on to v, so that we maintain the
|
||||||
// mapping of index==regexpid for level=0 prefilter nodes.
|
// mapping of index==regexpid for level=0 prefilter nodes.
|
||||||
@ -179,7 +190,7 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
if (f == NULL)
|
if (f == NULL)
|
||||||
continue;
|
continue;
|
||||||
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
|
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
|
||||||
const vector<Prefilter*>& subs = *f->subs();
|
const std::vector<Prefilter*>& subs = *f->subs();
|
||||||
for (size_t j = 0; j < subs.size(); j++)
|
for (size_t j = 0; j < subs.size(); j++)
|
||||||
v.push_back(subs[j]);
|
v.push_back(subs[j]);
|
||||||
}
|
}
|
||||||
@ -187,16 +198,16 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
|
|
||||||
// Identify unique nodes.
|
// Identify unique nodes.
|
||||||
int unique_id = 0;
|
int unique_id = 0;
|
||||||
for (int i = v.size() - 1; i >= 0; i--) {
|
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
|
||||||
Prefilter *node = v[i];
|
Prefilter *node = v[i];
|
||||||
if (node == NULL)
|
if (node == NULL)
|
||||||
continue;
|
continue;
|
||||||
node->set_unique_id(-1);
|
node->set_unique_id(-1);
|
||||||
Prefilter* canonical = CanonicalNode(node);
|
Prefilter* canonical = CanonicalNode(nodes, node);
|
||||||
if (canonical == NULL) {
|
if (canonical == NULL) {
|
||||||
// Any further nodes that have the same node string
|
// Any further nodes that have the same node string
|
||||||
// will find this node as the canonical node.
|
// will find this node as the canonical node.
|
||||||
node_map_[NodeString(node)] = node;
|
nodes->emplace(NodeString(node), node);
|
||||||
if (node->op() == Prefilter::ATOM) {
|
if (node->op() == Prefilter::ATOM) {
|
||||||
atom_vec->push_back(node->atom());
|
atom_vec->push_back(node->atom());
|
||||||
atom_index_to_id_.push_back(unique_id);
|
atom_index_to_id_.push_back(unique_id);
|
||||||
@ -206,15 +217,15 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
node->set_unique_id(canonical->unique_id());
|
node->set_unique_id(canonical->unique_id());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
entries_.resize(node_map_.size());
|
entries_.resize(nodes->size());
|
||||||
|
|
||||||
// Create parent IntMap for the entries.
|
// Create parent StdIntMap for the entries.
|
||||||
for (int i = v.size() - 1; i >= 0; i--) {
|
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
|
||||||
Prefilter* prefilter = v[i];
|
Prefilter* prefilter = v[i];
|
||||||
if (prefilter == NULL)
|
if (prefilter == NULL)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (CanonicalNode(prefilter) != prefilter)
|
if (CanonicalNode(nodes, prefilter) != prefilter)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
Entry* entry = &entries_[prefilter->unique_id()];
|
Entry* entry = &entries_[prefilter->unique_id()];
|
||||||
@ -222,12 +233,12 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fill the entries.
|
// Fill the entries.
|
||||||
for (int i = v.size() - 1; i >= 0; i--) {
|
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
|
||||||
Prefilter* prefilter = v[i];
|
Prefilter* prefilter = v[i];
|
||||||
if (prefilter == NULL)
|
if (prefilter == NULL)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
if (CanonicalNode(prefilter) != prefilter)
|
if (CanonicalNode(nodes, prefilter) != prefilter)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
Entry* entry = &entries_[prefilter->unique_id()];
|
Entry* entry = &entries_[prefilter->unique_id()];
|
||||||
@ -244,10 +255,10 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
|
|
||||||
case Prefilter::OR:
|
case Prefilter::OR:
|
||||||
case Prefilter::AND: {
|
case Prefilter::AND: {
|
||||||
set<int> uniq_child;
|
std::set<int> uniq_child;
|
||||||
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
|
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
|
||||||
Prefilter* child = (*prefilter->subs())[j];
|
Prefilter* child = (*prefilter->subs())[j];
|
||||||
Prefilter* canonical = CanonicalNode(child);
|
Prefilter* canonical = CanonicalNode(nodes, child);
|
||||||
if (canonical == NULL) {
|
if (canonical == NULL) {
|
||||||
LOG(DFATAL) << "Null canonical node";
|
LOG(DFATAL) << "Null canonical node";
|
||||||
return;
|
return;
|
||||||
@ -256,11 +267,14 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
uniq_child.insert(child_id);
|
uniq_child.insert(child_id);
|
||||||
// To the child, we want to add to parent indices.
|
// To the child, we want to add to parent indices.
|
||||||
Entry* child_entry = &entries_[child_id];
|
Entry* child_entry = &entries_[child_id];
|
||||||
if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end())
|
if (child_entry->parents->find(prefilter->unique_id()) ==
|
||||||
|
child_entry->parents->end()) {
|
||||||
(*child_entry->parents)[prefilter->unique_id()] = 1;
|
(*child_entry->parents)[prefilter->unique_id()] = 1;
|
||||||
}
|
}
|
||||||
entry->propagate_up_at_count =
|
}
|
||||||
prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
|
entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
|
||||||
|
? static_cast<int>(uniq_child.size())
|
||||||
|
: 1;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -271,29 +285,28 @@ void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
|
|||||||
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
|
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
|
||||||
if (prefilter_vec_[i] == NULL)
|
if (prefilter_vec_[i] == NULL)
|
||||||
continue;
|
continue;
|
||||||
int id = CanonicalNode(prefilter_vec_[i])->unique_id();
|
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
|
||||||
DCHECK_LE(0, id);
|
DCHECK_LE(0, id);
|
||||||
Entry* entry = &entries_[id];
|
Entry* entry = &entries_[id];
|
||||||
entry->regexps.push_back(i);
|
entry->regexps.push_back(static_cast<int>(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Functions for triggering during search.
|
// Functions for triggering during search.
|
||||||
void PrefilterTree::RegexpsGivenStrings(
|
void PrefilterTree::RegexpsGivenStrings(
|
||||||
const vector<int>& matched_atoms,
|
const std::vector<int>& matched_atoms,
|
||||||
vector<int>* regexps) const {
|
std::vector<int>* regexps) const {
|
||||||
regexps->clear();
|
regexps->clear();
|
||||||
if (!compiled_) {
|
if (!compiled_) {
|
||||||
LOG(WARNING) << "Compile() not called";
|
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
|
||||||
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
|
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
|
||||||
regexps->push_back(i);
|
regexps->push_back(static_cast<int>(i));
|
||||||
} else {
|
} else {
|
||||||
if (!prefilter_vec_.empty()) {
|
if (!prefilter_vec_.empty()) {
|
||||||
IntMap regexps_map(prefilter_vec_.size());
|
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
|
||||||
vector<int> matched_atom_ids;
|
std::vector<int> matched_atom_ids;
|
||||||
for (size_t j = 0; j < matched_atoms.size(); j++) {
|
for (size_t j = 0; j < matched_atoms.size(); j++) {
|
||||||
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
||||||
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
|
|
||||||
}
|
}
|
||||||
PropagateMatch(matched_atom_ids, &regexps_map);
|
PropagateMatch(matched_atom_ids, &regexps_map);
|
||||||
for (IntMap::iterator it = regexps_map.begin();
|
for (IntMap::iterator it = regexps_map.begin();
|
||||||
@ -304,23 +317,20 @@ void PrefilterTree::RegexpsGivenStrings(
|
|||||||
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sort(regexps->begin(), regexps->end());
|
std::sort(regexps->begin(), regexps->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
|
||||||
IntMap* regexps) const {
|
IntMap* regexps) const {
|
||||||
IntMap count(entries_.size());
|
IntMap count(static_cast<int>(entries_.size()));
|
||||||
IntMap work(entries_.size());
|
IntMap work(static_cast<int>(entries_.size()));
|
||||||
for (size_t i = 0; i < atom_ids.size(); i++)
|
for (size_t i = 0; i < atom_ids.size(); i++)
|
||||||
work.set(atom_ids[i], 1);
|
work.set(atom_ids[i], 1);
|
||||||
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
||||||
const Entry& entry = entries_[it->index()];
|
const Entry& entry = entries_[it->index()];
|
||||||
VLOG(10) << "Processing: " << it->index();
|
|
||||||
// Record regexps triggered.
|
// Record regexps triggered.
|
||||||
for (size_t i = 0; i < entry.regexps.size(); i++) {
|
for (size_t i = 0; i < entry.regexps.size(); i++)
|
||||||
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
|
|
||||||
regexps->set(entry.regexps[i], 1);
|
regexps->set(entry.regexps[i], 1);
|
||||||
}
|
|
||||||
int c;
|
int c;
|
||||||
// Pass trigger up to parents.
|
// Pass trigger up to parents.
|
||||||
for (StdIntMap::iterator it = entry.parents->begin();
|
for (StdIntMap::iterator it = entry.parents->begin();
|
||||||
@ -328,7 +338,6 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
|||||||
++it) {
|
++it) {
|
||||||
int j = it->first;
|
int j = it->first;
|
||||||
const Entry& parent = entries_[j];
|
const Entry& parent = entries_[j];
|
||||||
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
|
|
||||||
// Delay until all the children have succeeded.
|
// Delay until all the children have succeeded.
|
||||||
if (parent.propagate_up_at_count > 1) {
|
if (parent.propagate_up_at_count > 1) {
|
||||||
if (count.has_index(j)) {
|
if (count.has_index(j)) {
|
||||||
@ -341,7 +350,6 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
|||||||
if (c < parent.propagate_up_at_count)
|
if (c < parent.propagate_up_at_count)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
VLOG(10) << "Triggering: " << j;
|
|
||||||
// Trigger the parent.
|
// Trigger the parent.
|
||||||
work.set(j, 1);
|
work.set(j, 1);
|
||||||
}
|
}
|
||||||
@ -350,25 +358,25 @@ void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
|||||||
|
|
||||||
// Debugging help.
|
// Debugging help.
|
||||||
void PrefilterTree::PrintPrefilter(int regexpid) {
|
void PrefilterTree::PrintPrefilter(int regexpid) {
|
||||||
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
|
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PrefilterTree::PrintDebugInfo() {
|
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
|
||||||
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
|
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
|
||||||
VLOG(10) << "#Unique Nodes: " << entries_.size();
|
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
|
||||||
|
|
||||||
for (size_t i = 0; i < entries_.size(); ++i) {
|
for (size_t i = 0; i < entries_.size(); ++i) {
|
||||||
StdIntMap* parents = entries_[i].parents;
|
StdIntMap* parents = entries_[i].parents;
|
||||||
const vector<int>& regexps = entries_[i].regexps;
|
const std::vector<int>& regexps = entries_[i].regexps;
|
||||||
VLOG(10) << "EntryId: " << i
|
LOG(ERROR) << "EntryId: " << i
|
||||||
<< " N: " << parents->size() << " R: " << regexps.size();
|
<< " N: " << parents->size() << " R: " << regexps.size();
|
||||||
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
||||||
VLOG(10) << it->first;
|
LOG(ERROR) << it->first;
|
||||||
}
|
}
|
||||||
VLOG(10) << "Map:";
|
LOG(ERROR) << "Map:";
|
||||||
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
|
for (std::map<string, Prefilter*>::const_iterator iter = nodes->begin();
|
||||||
iter != node_map_.end(); ++iter)
|
iter != nodes->end(); ++iter)
|
||||||
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
|
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
|
||||||
<< " Str: " << (*iter).first;
|
<< " Str: " << (*iter).first;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -385,7 +393,7 @@ string PrefilterTree::DebugNodeString(Prefilter* node) const {
|
|||||||
for (size_t i = 0; i < node->subs()->size(); i++) {
|
for (size_t i = 0; i < node->subs()->size(); i++) {
|
||||||
if (i > 0)
|
if (i > 0)
|
||||||
node_string += ',';
|
node_string += ',';
|
||||||
node_string += Itoa((*node->subs())[i]->unique_id());
|
node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
|
||||||
node_string += ":";
|
node_string += ":";
|
||||||
node_string += DebugNodeString((*node->subs())[i]);
|
node_string += DebugNodeString((*node->subs())[i]);
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_PREFILTER_TREE_H_
|
||||||
|
#define RE2_PREFILTER_TREE_H_
|
||||||
|
|
||||||
// The PrefilterTree class is used to form an AND-OR tree of strings
|
// The PrefilterTree class is used to form an AND-OR tree of strings
|
||||||
// that would trigger each regexp. The 'prefilter' of each regexp is
|
// that would trigger each regexp. The 'prefilter' of each regexp is
|
||||||
// added tp PrefilterTree, and then PrefilterTree is used to find all
|
// added tp PrefilterTree, and then PrefilterTree is used to find all
|
||||||
@ -12,23 +15,21 @@
|
|||||||
// favorite engine. PrefilterTree provides a set of strings (called
|
// favorite engine. PrefilterTree provides a set of strings (called
|
||||||
// atoms) that the user of this class should use to do the string
|
// atoms) that the user of this class should use to do the string
|
||||||
// matching.
|
// matching.
|
||||||
//
|
|
||||||
#ifndef RE2_PREFILTER_TREE_H_
|
#include <map>
|
||||||
#define RE2_PREFILTER_TREE_H_
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
#include "util/sparse_array.h"
|
#include "util/sparse_array.h"
|
||||||
|
#include "re2/prefilter.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
typedef SparseArray<int> IntMap;
|
|
||||||
typedef map<int,int> StdIntMap;
|
|
||||||
|
|
||||||
class Prefilter;
|
|
||||||
|
|
||||||
class PrefilterTree {
|
class PrefilterTree {
|
||||||
public:
|
public:
|
||||||
PrefilterTree();
|
PrefilterTree();
|
||||||
|
explicit PrefilterTree(int min_atom_len);
|
||||||
~PrefilterTree();
|
~PrefilterTree();
|
||||||
|
|
||||||
// Adds the prefilter for the next regexp. Note that we assume that
|
// Adds the prefilter for the next regexp. Note that we assume that
|
||||||
@ -42,20 +43,24 @@ class PrefilterTree {
|
|||||||
// The caller should use the returned set of strings to do string matching.
|
// The caller should use the returned set of strings to do string matching.
|
||||||
// Each time a string matches, the corresponding index then has to be
|
// Each time a string matches, the corresponding index then has to be
|
||||||
// and passed to RegexpsGivenStrings below.
|
// and passed to RegexpsGivenStrings below.
|
||||||
void Compile(vector<string>* atom_vec);
|
void Compile(std::vector<string>* atom_vec);
|
||||||
|
|
||||||
// Given the indices of the atoms that matched, returns the indexes
|
// Given the indices of the atoms that matched, returns the indexes
|
||||||
// of regexps that should be searched. The matched_atoms should
|
// of regexps that should be searched. The matched_atoms should
|
||||||
// contain all the ids of string atoms that were found to match the
|
// contain all the ids of string atoms that were found to match the
|
||||||
// content. The caller can use any string match engine to perform
|
// content. The caller can use any string match engine to perform
|
||||||
// this function. This function is thread safe.
|
// this function. This function is thread safe.
|
||||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
|
||||||
vector<int>* regexps) const;
|
std::vector<int>* regexps) const;
|
||||||
|
|
||||||
// Print debug prefilter. Also prints unique ids associated with
|
// Print debug prefilter. Also prints unique ids associated with
|
||||||
// nodes of the prefilter of the regexp.
|
// nodes of the prefilter of the regexp.
|
||||||
void PrintPrefilter(int regexpid);
|
void PrintPrefilter(int regexpid);
|
||||||
|
|
||||||
|
private:
|
||||||
|
typedef SparseArray<int> IntMap;
|
||||||
|
typedef std::map<int, int> StdIntMap;
|
||||||
|
typedef std::map<string, Prefilter*> NodeMap;
|
||||||
|
|
||||||
// Each unique node has a corresponding Entry that helps in
|
// Each unique node has a corresponding Entry that helps in
|
||||||
// passing the matching trigger information along the tree.
|
// passing the matching trigger information along the tree.
|
||||||
@ -76,22 +81,24 @@ class PrefilterTree {
|
|||||||
|
|
||||||
// When this node is ready to trigger the parent, what are the
|
// When this node is ready to trigger the parent, what are the
|
||||||
// regexps that are triggered.
|
// regexps that are triggered.
|
||||||
vector<int> regexps;
|
std::vector<int> regexps;
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
// Returns true if the prefilter node should be kept.
|
||||||
|
bool KeepNode(Prefilter* node) const;
|
||||||
|
|
||||||
// This function assigns unique ids to various parts of the
|
// This function assigns unique ids to various parts of the
|
||||||
// prefilter, by looking at if these nodes are already in the
|
// prefilter, by looking at if these nodes are already in the
|
||||||
// PrefilterTree.
|
// PrefilterTree.
|
||||||
void AssignUniqueIds(vector<string>* atom_vec);
|
void AssignUniqueIds(NodeMap* nodes, std::vector<string>* atom_vec);
|
||||||
|
|
||||||
// Given the matching atoms, find the regexps to be triggered.
|
// Given the matching atoms, find the regexps to be triggered.
|
||||||
void PropagateMatch(const vector<int>& atom_ids,
|
void PropagateMatch(const std::vector<int>& atom_ids,
|
||||||
IntMap* regexps) const;
|
IntMap* regexps) const;
|
||||||
|
|
||||||
// Returns the prefilter node that has the same NodeString as this
|
// Returns the prefilter node that has the same NodeString as this
|
||||||
// node. For the canonical node, returns node.
|
// node. For the canonical node, returns node.
|
||||||
Prefilter* CanonicalNode(Prefilter* node);
|
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
|
||||||
|
|
||||||
// A string that uniquely identifies the node. Assumes that the
|
// A string that uniquely identifies the node. Assumes that the
|
||||||
// children of node has already been assigned unique ids.
|
// children of node has already been assigned unique ids.
|
||||||
@ -101,29 +108,30 @@ class PrefilterTree {
|
|||||||
string DebugNodeString(Prefilter* node) const;
|
string DebugNodeString(Prefilter* node) const;
|
||||||
|
|
||||||
// Used for debugging.
|
// Used for debugging.
|
||||||
void PrintDebugInfo();
|
void PrintDebugInfo(NodeMap* nodes);
|
||||||
|
|
||||||
// These are all the nodes formed by Compile. Essentially, there is
|
// These are all the nodes formed by Compile. Essentially, there is
|
||||||
// one node for each unique atom and each unique AND/OR node.
|
// one node for each unique atom and each unique AND/OR node.
|
||||||
vector<Entry> entries_;
|
std::vector<Entry> entries_;
|
||||||
|
|
||||||
// Map node string to canonical Prefilter node.
|
|
||||||
map<string, Prefilter*> node_map_;
|
|
||||||
|
|
||||||
// indices of regexps that always pass through the filter (since we
|
// indices of regexps that always pass through the filter (since we
|
||||||
// found no required literals in these regexps).
|
// found no required literals in these regexps).
|
||||||
vector<int> unfiltered_;
|
std::vector<int> unfiltered_;
|
||||||
|
|
||||||
// vector of Prefilter for all regexps.
|
// vector of Prefilter for all regexps.
|
||||||
vector<Prefilter*> prefilter_vec_;
|
std::vector<Prefilter*> prefilter_vec_;
|
||||||
|
|
||||||
// Atom index in returned strings to entry id mapping.
|
// Atom index in returned strings to entry id mapping.
|
||||||
vector<int> atom_index_to_id_;
|
std::vector<int> atom_index_to_id_;
|
||||||
|
|
||||||
// Has the prefilter tree been compiled.
|
// Has the prefilter tree been compiled.
|
||||||
bool compiled_;
|
bool compiled_;
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
|
// Strings less than this length are not stored as atoms.
|
||||||
|
const int min_atom_len_;
|
||||||
|
|
||||||
|
PrefilterTree(const PrefilterTree&) = delete;
|
||||||
|
PrefilterTree& operator=(const PrefilterTree&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
@ -5,48 +5,57 @@
|
|||||||
// Compiled regular expression representation.
|
// Compiled regular expression representation.
|
||||||
// Tested by compile_test.cc
|
// Tested by compile_test.cc
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/sparse_set.h"
|
|
||||||
#include "re2/prog.h"
|
#include "re2/prog.h"
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
#include "re2/bitmap256.h"
|
||||||
#include "re2/stringpiece.h"
|
#include "re2/stringpiece.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
// Constructors per Inst opcode
|
// Constructors per Inst opcode
|
||||||
|
|
||||||
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
|
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
DCHECK_EQ(out_opcode_, 0);
|
||||||
set_out_opcode(out, kInstAlt);
|
set_out_opcode(out, kInstAlt);
|
||||||
out1_ = out1;
|
out1_ = out1;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
|
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
DCHECK_EQ(out_opcode_, 0);
|
||||||
set_out_opcode(out, kInstByteRange);
|
set_out_opcode(out, kInstByteRange);
|
||||||
lo_ = lo & 0xFF;
|
lo_ = lo & 0xFF;
|
||||||
hi_ = hi & 0xFF;
|
hi_ = hi & 0xFF;
|
||||||
foldcase_ = foldcase;
|
foldcase_ = foldcase & 0xFF;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::Inst::InitCapture(int cap, uint32 out) {
|
void Prog::Inst::InitCapture(int cap, uint32_t out) {
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
DCHECK_EQ(out_opcode_, 0);
|
||||||
set_out_opcode(out, kInstCapture);
|
set_out_opcode(out, kInstCapture);
|
||||||
cap_ = cap;
|
cap_ = cap;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
|
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
DCHECK_EQ(out_opcode_, 0);
|
||||||
set_out_opcode(out, kInstEmptyWidth);
|
set_out_opcode(out, kInstEmptyWidth);
|
||||||
empty_ = empty;
|
empty_ = empty;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::Inst::InitMatch(int32 id) {
|
void Prog::Inst::InitMatch(int32_t id) {
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
DCHECK_EQ(out_opcode_, 0);
|
||||||
set_opcode(kInstMatch);
|
set_opcode(kInstMatch);
|
||||||
match_id_ = id;
|
match_id_ = id;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::Inst::InitNop(uint32 out) {
|
void Prog::Inst::InitNop(uint32_t out) {
|
||||||
DCHECK_EQ(out_opcode_, 0);
|
DCHECK_EQ(out_opcode_, 0);
|
||||||
set_opcode(kInstNop);
|
set_opcode(kInstNop);
|
||||||
}
|
}
|
||||||
@ -94,34 +103,27 @@ Prog::Prog()
|
|||||||
: anchor_start_(false),
|
: anchor_start_(false),
|
||||||
anchor_end_(false),
|
anchor_end_(false),
|
||||||
reversed_(false),
|
reversed_(false),
|
||||||
|
did_flatten_(false),
|
||||||
did_onepass_(false),
|
did_onepass_(false),
|
||||||
start_(0),
|
start_(0),
|
||||||
start_unanchored_(0),
|
start_unanchored_(0),
|
||||||
size_(0),
|
size_(0),
|
||||||
byte_inst_count_(0),
|
|
||||||
bytemap_range_(0),
|
bytemap_range_(0),
|
||||||
|
first_byte_(-1),
|
||||||
flags_(0),
|
flags_(0),
|
||||||
onepass_statesize_(0),
|
list_count_(0),
|
||||||
inst_(NULL),
|
inst_(NULL),
|
||||||
dfa_first_(NULL),
|
|
||||||
dfa_longest_(NULL),
|
|
||||||
dfa_mem_(0),
|
|
||||||
delete_dfa_(NULL),
|
|
||||||
unbytemap_(NULL),
|
|
||||||
onepass_nodes_(NULL),
|
onepass_nodes_(NULL),
|
||||||
onepass_start_(NULL) {
|
dfa_mem_(0),
|
||||||
|
dfa_first_(NULL),
|
||||||
|
dfa_longest_(NULL) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Prog::~Prog() {
|
Prog::~Prog() {
|
||||||
if (delete_dfa_) {
|
DeleteDFA(dfa_longest_);
|
||||||
if (dfa_first_)
|
DeleteDFA(dfa_first_);
|
||||||
delete_dfa_(dfa_first_);
|
|
||||||
if (dfa_longest_)
|
|
||||||
delete_dfa_(dfa_longest_);
|
|
||||||
}
|
|
||||||
delete[] onepass_nodes_;
|
delete[] onepass_nodes_;
|
||||||
delete[] inst_;
|
delete[] inst_;
|
||||||
delete[] unbytemap_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
typedef SparseSet Workq;
|
typedef SparseSet Workq;
|
||||||
@ -133,7 +135,6 @@ static inline void AddToQueue(Workq* q, int id) {
|
|||||||
|
|
||||||
static string ProgToString(Prog* prog, Workq* q) {
|
static string ProgToString(Prog* prog, Workq* q) {
|
||||||
string s;
|
string s;
|
||||||
|
|
||||||
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
|
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
|
||||||
int id = *i;
|
int id = *i;
|
||||||
Prog::Inst* ip = prog->inst(id);
|
Prog::Inst* ip = prog->inst(id);
|
||||||
@ -145,29 +146,56 @@ static string ProgToString(Prog* prog, Workq* q) {
|
|||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static string FlattenedProgToString(Prog* prog, int start) {
|
||||||
|
string s;
|
||||||
|
for (int id = start; id < prog->size(); id++) {
|
||||||
|
Prog::Inst* ip = prog->inst(id);
|
||||||
|
if (ip->last())
|
||||||
|
StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
|
||||||
|
else
|
||||||
|
StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str());
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
string Prog::Dump() {
|
string Prog::Dump() {
|
||||||
string map;
|
if (did_flatten_)
|
||||||
if (false) { // Debugging
|
return FlattenedProgToString(this, start_);
|
||||||
int lo = 0;
|
|
||||||
StringAppendF(&map, "byte map:\n");
|
|
||||||
for (int i = 0; i < bytemap_range_; i++) {
|
|
||||||
StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
|
|
||||||
lo = unbytemap_[i] + 1;
|
|
||||||
}
|
|
||||||
StringAppendF(&map, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
Workq q(size_);
|
Workq q(size_);
|
||||||
AddToQueue(&q, start_);
|
AddToQueue(&q, start_);
|
||||||
return map + ProgToString(this, &q);
|
return ProgToString(this, &q);
|
||||||
}
|
}
|
||||||
|
|
||||||
string Prog::DumpUnanchored() {
|
string Prog::DumpUnanchored() {
|
||||||
|
if (did_flatten_)
|
||||||
|
return FlattenedProgToString(this, start_unanchored_);
|
||||||
|
|
||||||
Workq q(size_);
|
Workq q(size_);
|
||||||
AddToQueue(&q, start_unanchored_);
|
AddToQueue(&q, start_unanchored_);
|
||||||
return ProgToString(this, &q);
|
return ProgToString(this, &q);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string Prog::DumpByteMap() {
|
||||||
|
string map;
|
||||||
|
for (int c = 0; c < 256; c++) {
|
||||||
|
int b = bytemap_[c];
|
||||||
|
int lo = c;
|
||||||
|
while (c < 256-1 && bytemap_[c+1] == b)
|
||||||
|
c++;
|
||||||
|
int hi = c;
|
||||||
|
StringAppendF(&map, "[%02x-%02x] -> %d\n", lo, hi, b);
|
||||||
|
}
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
int Prog::first_byte() {
|
||||||
|
std::call_once(first_byte_once_, [](Prog* prog) {
|
||||||
|
prog->first_byte_ = prog->ComputeFirstByte();
|
||||||
|
}, this);
|
||||||
|
return first_byte_;
|
||||||
|
}
|
||||||
|
|
||||||
static bool IsMatch(Prog*, Prog::Inst*);
|
static bool IsMatch(Prog*, Prog::Inst*);
|
||||||
|
|
||||||
// Peep-hole optimizer.
|
// Peep-hole optimizer.
|
||||||
@ -260,7 +288,7 @@ static bool IsMatch(Prog* prog, Prog::Inst* ip) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
||||||
int flags = 0;
|
int flags = 0;
|
||||||
|
|
||||||
// ^ and \A
|
// ^ and \A
|
||||||
@ -294,50 +322,505 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
|
|||||||
return flags;
|
return flags;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::MarkByteRange(int lo, int hi) {
|
// ByteMapBuilder implements a coloring algorithm.
|
||||||
|
//
|
||||||
|
// The first phase is a series of "mark and merge" batches: we mark one or more
|
||||||
|
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
|
||||||
|
// performance; rather, it means that the ranges are treated indistinguishably.
|
||||||
|
//
|
||||||
|
// Internally, the ranges are represented using a bitmap that stores the splits
|
||||||
|
// and a vector that stores the colors; both of them are indexed by the ranges'
|
||||||
|
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
|
||||||
|
// hi (if not already split), then recolor each range in between. The color map
|
||||||
|
// (i.e. from the old color to the new color) is maintained for the lifetime of
|
||||||
|
// the batch and so underpins this somewhat obscure approach to set operations.
|
||||||
|
//
|
||||||
|
// The second phase builds the bytemap from our internal state: we recolor each
|
||||||
|
// range, then store the new color (which is now the byte class) in each of the
|
||||||
|
// corresponding array elements. Finally, we output the number of byte classes.
|
||||||
|
class ByteMapBuilder {
|
||||||
|
public:
|
||||||
|
ByteMapBuilder() {
|
||||||
|
// Initial state: the [0-255] range has color 256.
|
||||||
|
// This will avoid problems during the second phase,
|
||||||
|
// in which we assign byte classes numbered from 0.
|
||||||
|
splits_.Set(255);
|
||||||
|
colors_.resize(256);
|
||||||
|
colors_[255] = 256;
|
||||||
|
nextcolor_ = 257;
|
||||||
|
}
|
||||||
|
|
||||||
|
void Mark(int lo, int hi);
|
||||||
|
void Merge();
|
||||||
|
void Build(uint8_t* bytemap, int* bytemap_range);
|
||||||
|
|
||||||
|
private:
|
||||||
|
int Recolor(int oldcolor);
|
||||||
|
|
||||||
|
Bitmap256 splits_;
|
||||||
|
std::vector<int> colors_;
|
||||||
|
int nextcolor_;
|
||||||
|
std::vector<std::pair<int, int>> colormap_;
|
||||||
|
std::vector<std::pair<int, int>> ranges_;
|
||||||
|
|
||||||
|
ByteMapBuilder(const ByteMapBuilder&) = delete;
|
||||||
|
ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
|
||||||
|
};
|
||||||
|
|
||||||
|
void ByteMapBuilder::Mark(int lo, int hi) {
|
||||||
DCHECK_GE(lo, 0);
|
DCHECK_GE(lo, 0);
|
||||||
DCHECK_GE(hi, 0);
|
DCHECK_GE(hi, 0);
|
||||||
DCHECK_LE(lo, 255);
|
DCHECK_LE(lo, 255);
|
||||||
DCHECK_LE(hi, 255);
|
DCHECK_LE(hi, 255);
|
||||||
DCHECK_LE(lo, hi);
|
DCHECK_LE(lo, hi);
|
||||||
if (0 < lo && lo <= 255)
|
|
||||||
byterange_.Set(lo - 1);
|
// Ignore any [0-255] ranges. They cause us to recolor every range, which
|
||||||
if (0 <= hi && hi <= 255)
|
// has no effect on the eventual result and is therefore a waste of time.
|
||||||
byterange_.Set(hi);
|
if (lo == 0 && hi == 255)
|
||||||
|
return;
|
||||||
|
|
||||||
|
ranges_.emplace_back(lo, hi);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ByteMapBuilder::Merge() {
|
||||||
|
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
|
||||||
|
it != ranges_.end();
|
||||||
|
++it) {
|
||||||
|
int lo = it->first-1;
|
||||||
|
int hi = it->second;
|
||||||
|
|
||||||
|
if (0 <= lo && !splits_.Test(lo)) {
|
||||||
|
splits_.Set(lo);
|
||||||
|
int next = splits_.FindNextSetBit(lo+1);
|
||||||
|
colors_[lo] = colors_[next];
|
||||||
|
}
|
||||||
|
if (!splits_.Test(hi)) {
|
||||||
|
splits_.Set(hi);
|
||||||
|
int next = splits_.FindNextSetBit(hi+1);
|
||||||
|
colors_[hi] = colors_[next];
|
||||||
|
}
|
||||||
|
|
||||||
|
int c = lo+1;
|
||||||
|
while (c < 256) {
|
||||||
|
int next = splits_.FindNextSetBit(c);
|
||||||
|
colors_[next] = Recolor(colors_[next]);
|
||||||
|
if (next == hi)
|
||||||
|
break;
|
||||||
|
c = next+1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
colormap_.clear();
|
||||||
|
ranges_.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
|
||||||
|
// Assign byte classes numbered from 0.
|
||||||
|
nextcolor_ = 0;
|
||||||
|
|
||||||
|
int c = 0;
|
||||||
|
while (c < 256) {
|
||||||
|
int next = splits_.FindNextSetBit(c);
|
||||||
|
uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
|
||||||
|
while (c <= next) {
|
||||||
|
bytemap[c] = b;
|
||||||
|
c++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*bytemap_range = nextcolor_;
|
||||||
|
}
|
||||||
|
|
||||||
|
int ByteMapBuilder::Recolor(int oldcolor) {
|
||||||
|
// Yes, this is a linear search. There can be at most 256
|
||||||
|
// colors and there will typically be far fewer than that.
|
||||||
|
// Also, we need to consider keys *and* values in order to
|
||||||
|
// avoid recoloring a given range more than once per batch.
|
||||||
|
std::vector<std::pair<int, int>>::const_iterator it =
|
||||||
|
std::find_if(colormap_.begin(), colormap_.end(),
|
||||||
|
[=](const std::pair<int, int>& kv) -> bool {
|
||||||
|
return kv.first == oldcolor || kv.second == oldcolor;
|
||||||
|
});
|
||||||
|
if (it != colormap_.end())
|
||||||
|
return it->second;
|
||||||
|
int newcolor = nextcolor_;
|
||||||
|
nextcolor_++;
|
||||||
|
colormap_.emplace_back(oldcolor, newcolor);
|
||||||
|
return newcolor;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Prog::ComputeByteMap() {
|
void Prog::ComputeByteMap() {
|
||||||
// Fill in bytemap with byte classes for prog_.
|
// Fill in bytemap with byte classes for the program.
|
||||||
// Ranges of bytes that are treated as indistinguishable
|
// Ranges of bytes that are treated indistinguishably
|
||||||
// by the regexp program are mapped to a single byte class.
|
// will be mapped to a single byte class.
|
||||||
// The vector prog_->byterange() marks the end of each
|
ByteMapBuilder builder;
|
||||||
// such range.
|
|
||||||
const Bitmap<256>& v = byterange();
|
|
||||||
|
|
||||||
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
|
// Don't repeat the work for ^ and $.
|
||||||
uint8 n = 0;
|
bool marked_line_boundaries = false;
|
||||||
uint32 bits = 0;
|
// Don't repeat the work for \b and \B.
|
||||||
for (int i = 0; i < 256; i++) {
|
bool marked_word_boundaries = false;
|
||||||
if ((i&31) == 0)
|
|
||||||
bits = v.Word(i >> 5);
|
for (int id = 0; id < size(); id++) {
|
||||||
bytemap_[i] = n;
|
Inst* ip = inst(id);
|
||||||
n += bits & 1;
|
if (ip->opcode() == kInstByteRange) {
|
||||||
bits >>= 1;
|
int lo = ip->lo();
|
||||||
|
int hi = ip->hi();
|
||||||
|
builder.Mark(lo, hi);
|
||||||
|
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
|
||||||
|
int foldlo = lo;
|
||||||
|
int foldhi = hi;
|
||||||
|
if (foldlo < 'a')
|
||||||
|
foldlo = 'a';
|
||||||
|
if (foldhi > 'z')
|
||||||
|
foldhi = 'z';
|
||||||
|
if (foldlo <= foldhi)
|
||||||
|
builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
|
||||||
}
|
}
|
||||||
bytemap_range_ = bytemap_[255] + 1;
|
// If this Inst is not the last Inst in its list AND the next Inst is
|
||||||
unbytemap_ = new uint8[bytemap_range_];
|
// also a ByteRange AND the Insts have the same out, defer the merge.
|
||||||
|
if (!ip->last() &&
|
||||||
|
inst(id+1)->opcode() == kInstByteRange &&
|
||||||
|
ip->out() == inst(id+1)->out())
|
||||||
|
continue;
|
||||||
|
builder.Merge();
|
||||||
|
} else if (ip->opcode() == kInstEmptyWidth) {
|
||||||
|
if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
|
||||||
|
!marked_line_boundaries) {
|
||||||
|
builder.Mark('\n', '\n');
|
||||||
|
builder.Merge();
|
||||||
|
marked_line_boundaries = true;
|
||||||
|
}
|
||||||
|
if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
|
||||||
|
!marked_word_boundaries) {
|
||||||
|
// We require two batches here: the first for ranges that are word
|
||||||
|
// characters, the second for ranges that are not word characters.
|
||||||
|
for (bool isword : {true, false}) {
|
||||||
|
int j;
|
||||||
|
for (int i = 0; i < 256; i = j) {
|
||||||
|
for (j = i + 1; j < 256 &&
|
||||||
|
Prog::IsWordChar(static_cast<uint8_t>(i)) ==
|
||||||
|
Prog::IsWordChar(static_cast<uint8_t>(j));
|
||||||
|
j++)
|
||||||
|
;
|
||||||
|
if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
|
||||||
|
builder.Mark(i, j - 1);
|
||||||
|
}
|
||||||
|
builder.Merge();
|
||||||
|
}
|
||||||
|
marked_word_boundaries = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
builder.Build(bytemap_, &bytemap_range_);
|
||||||
|
|
||||||
|
if (0) { // For debugging, use trivial bytemap.
|
||||||
|
LOG(ERROR) << "Using trivial bytemap.";
|
||||||
for (int i = 0; i < 256; i++)
|
for (int i = 0; i < 256; i++)
|
||||||
unbytemap_[bytemap_[i]] = i;
|
bytemap_[i] = static_cast<uint8_t>(i);
|
||||||
|
|
||||||
if (0) { // For debugging: use trivial byte map.
|
|
||||||
for (int i = 0; i < 256; i++) {
|
|
||||||
bytemap_[i] = i;
|
|
||||||
unbytemap_[i] = i;
|
|
||||||
}
|
|
||||||
bytemap_range_ = 256;
|
bytemap_range_ = 256;
|
||||||
LOG(INFO) << "Using trivial bytemap.";
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prog::Flatten() implements a graph rewriting algorithm.
|
||||||
|
//
|
||||||
|
// The overall process is similar to epsilon removal, but retains some epsilon
|
||||||
|
// transitions: those from Capture and EmptyWidth instructions; and those from
|
||||||
|
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
|
||||||
|
// in the worst case.) It might be best thought of as Alt instruction elision.
|
||||||
|
//
|
||||||
|
// In conceptual terms, it divides the Prog into "trees" of instructions, then
|
||||||
|
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
|
||||||
|
// is one or more instructions that grow from one "root" instruction to one or
|
||||||
|
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
|
||||||
|
// "root" is also the "leaf". In most cases, a "root" is the successor of some
|
||||||
|
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
|
||||||
|
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
|
||||||
|
// EmptyWidth or Match instruction. However, this is insufficient for handling
|
||||||
|
// nested nullable subexpressions correctly, so in some cases, a "root" is the
|
||||||
|
// dominator of the instructions reachable from some "successor root" (i.e. it
|
||||||
|
// has an unreachable predecessor) and is considered a "dominator root". Since
|
||||||
|
// only Alt instructions can be "dominator roots" (other instructions would be
|
||||||
|
// "leaves"), only Alt instructions are required to be marked as predecessors.
|
||||||
|
//
|
||||||
|
// Dividing the Prog into "trees" comprises two passes: marking the "successor
|
||||||
|
// roots" and the predecessors; and marking the "dominator roots". Sorting the
|
||||||
|
// "successor roots" by their bytecode offsets enables iteration in order from
|
||||||
|
// greatest to least during the second pass; by working backwards in this case
|
||||||
|
// and flooding the graph no further than "leaves" and already marked "roots",
|
||||||
|
// it becomes possible to mark "dominator roots" without doing excessive work.
|
||||||
|
//
|
||||||
|
// Traversing the "trees" is just iterating over the "roots" in order of their
|
||||||
|
// marking and flooding the graph no further than "leaves" and "roots". When a
|
||||||
|
// "leaf" is reached, the instruction is copied with its successor remapped to
|
||||||
|
// its "root" number. When a "root" is reached, a Nop instruction is generated
|
||||||
|
// with its successor remapped similarly. As each "list" is produced, its last
|
||||||
|
// instruction is marked as such. After all of the "lists" have been produced,
|
||||||
|
// a pass over their instructions remaps their successors to bytecode offsets.
|
||||||
|
void Prog::Flatten() {
|
||||||
|
if (did_flatten_)
|
||||||
|
return;
|
||||||
|
did_flatten_ = true;
|
||||||
|
|
||||||
|
// Scratch structures. It's important that these are reused by functions
|
||||||
|
// that we call in loops because they would thrash the heap otherwise.
|
||||||
|
SparseSet reachable(size());
|
||||||
|
std::vector<int> stk;
|
||||||
|
stk.reserve(size());
|
||||||
|
|
||||||
|
// First pass: Marks "successor roots" and predecessors.
|
||||||
|
// Builds the mapping from inst-ids to root-ids.
|
||||||
|
SparseArray<int> rootmap(size());
|
||||||
|
SparseArray<int> predmap(size());
|
||||||
|
std::vector<std::vector<int>> predvec;
|
||||||
|
MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
|
||||||
|
|
||||||
|
// Second pass: Marks "dominator roots".
|
||||||
|
SparseArray<int> sorted(rootmap);
|
||||||
|
std::sort(sorted.begin(), sorted.end(), sorted.less);
|
||||||
|
for (SparseArray<int>::const_iterator i = sorted.end() - 1;
|
||||||
|
i != sorted.begin();
|
||||||
|
--i) {
|
||||||
|
if (i->index() != start_unanchored() && i->index() != start())
|
||||||
|
MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Third pass: Emits "lists". Remaps outs to root-ids.
|
||||||
|
// Builds the mapping from root-ids to flat-ids.
|
||||||
|
std::vector<int> flatmap(rootmap.size());
|
||||||
|
std::vector<Inst> flat;
|
||||||
|
flat.reserve(size());
|
||||||
|
for (SparseArray<int>::const_iterator i = rootmap.begin();
|
||||||
|
i != rootmap.end();
|
||||||
|
++i) {
|
||||||
|
flatmap[i->value()] = static_cast<int>(flat.size());
|
||||||
|
EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
|
||||||
|
flat.back().set_last();
|
||||||
|
}
|
||||||
|
|
||||||
|
list_count_ = static_cast<int>(flatmap.size());
|
||||||
|
for (int i = 0; i < kNumInst; i++)
|
||||||
|
inst_count_[i] = 0;
|
||||||
|
|
||||||
|
// Fourth pass: Remaps outs to flat-ids.
|
||||||
|
// Counts instructions by opcode.
|
||||||
|
for (int id = 0; id < static_cast<int>(flat.size()); id++) {
|
||||||
|
Inst* ip = &flat[id];
|
||||||
|
if (ip->opcode() != kInstAltMatch) // handled in EmitList()
|
||||||
|
ip->set_out(flatmap[ip->out()]);
|
||||||
|
inst_count_[ip->opcode()]++;
|
||||||
|
}
|
||||||
|
|
||||||
|
int total = 0;
|
||||||
|
for (int i = 0; i < kNumInst; i++)
|
||||||
|
total += inst_count_[i];
|
||||||
|
DCHECK_EQ(total, static_cast<int>(flat.size()));
|
||||||
|
|
||||||
|
// Remap start_unanchored and start.
|
||||||
|
if (start_unanchored() == 0) {
|
||||||
|
DCHECK_EQ(start(), 0);
|
||||||
|
} else if (start_unanchored() == start()) {
|
||||||
|
set_start_unanchored(flatmap[1]);
|
||||||
|
set_start(flatmap[1]);
|
||||||
|
} else {
|
||||||
|
set_start_unanchored(flatmap[1]);
|
||||||
|
set_start(flatmap[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finally, replace the old instructions with the new instructions.
|
||||||
|
size_ = static_cast<int>(flat.size());
|
||||||
|
delete[] inst_;
|
||||||
|
inst_ = new Inst[size_];
|
||||||
|
memmove(inst_, flat.data(), size_ * sizeof *inst_);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Walks the Prog from start_unanchored(), marking "successor roots" in
// rootmap (inst-id -> root-id) and recording, for each out of every Alt or
// AltMatch instruction visited, which instructions point at it: predmap maps
// an inst-id to a slot in predvec, and that slot lists the predecessors.
// reachable and stk are caller-provided scratch structures; both are cleared
// here before use.
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
                          SparseArray<int>* predmap,
                          std::vector<std::vector<int>>* predvec,
                          SparseSet* reachable, std::vector<int>* stk) {
  // Instruction 0 (the kInstFail instruction) is marked first.
  rootmap->set_new(0, rootmap->size());

  // Then the two entry points, skipping any that are already marked.
  const int entries[2] = {start_unanchored(), start()};
  for (int entry : entries) {
    if (!rootmap->has_index(entry))
      rootmap->set_new(entry, rootmap->size());
  }

  reachable->clear();
  stk->clear();
  stk->push_back(start_unanchored());
  while (!stk->empty()) {
    int cur = stk->back();
    stk->pop_back();

    // Follow the chain of out() edges from cur until we hit an instruction
    // that was already visited or one that has no successor to follow.
    while (!reachable->contains(cur)) {
      reachable->insert_new(cur);

      Inst* ip = inst(cur);
      bool follow = false;
      switch (ip->opcode()) {
        case kInstAltMatch:
        case kInstAlt: {
          // Record cur as a predecessor of both of its outs.
          const int outs[2] = {ip->out(), ip->out1()};
          for (int out : outs) {
            if (!predmap->has_index(out)) {
              predmap->set_new(out, static_cast<int>(predvec->size()));
              predvec->emplace_back();
            }
            (*predvec)[predmap->get_existing(out)].push_back(cur);
          }
          // Defer the second branch; keep walking down the first.
          stk->push_back(outs[1]);
          cur = outs[0];
          follow = true;
          break;
        }

        case kInstByteRange:
        case kInstCapture:
        case kInstEmptyWidth: {
          // The out of one of these instructions is a "root".
          const int next = ip->out();
          if (!rootmap->has_index(next))
            rootmap->set_new(next, rootmap->size());
          cur = next;
          follow = true;
          break;
        }

        case kInstNop:
          cur = ip->out();
          follow = true;
          break;

        case kInstMatch:
        case kInstFail:
          break;

        default:
          LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
          break;
      }

      if (!follow)
        break;
    }
  }
}
|
||||||
|
|
||||||
|
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
|
||||||
|
SparseArray<int>* predmap,
|
||||||
|
std::vector<std::vector<int>>* predvec,
|
||||||
|
SparseSet* reachable, std::vector<int>* stk) {
|
||||||
|
reachable->clear();
|
||||||
|
stk->clear();
|
||||||
|
stk->push_back(root);
|
||||||
|
while (!stk->empty()) {
|
||||||
|
int id = stk->back();
|
||||||
|
stk->pop_back();
|
||||||
|
Loop:
|
||||||
|
if (reachable->contains(id))
|
||||||
|
continue;
|
||||||
|
reachable->insert_new(id);
|
||||||
|
|
||||||
|
if (id != root && rootmap->has_index(id)) {
|
||||||
|
// We reached another "tree" via epsilon transition.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Inst* ip = inst(id);
|
||||||
|
switch (ip->opcode()) {
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstAltMatch:
|
||||||
|
case kInstAlt:
|
||||||
|
stk->push_back(ip->out1());
|
||||||
|
id = ip->out();
|
||||||
|
goto Loop;
|
||||||
|
|
||||||
|
case kInstByteRange:
|
||||||
|
case kInstCapture:
|
||||||
|
case kInstEmptyWidth:
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstNop:
|
||||||
|
id = ip->out();
|
||||||
|
goto Loop;
|
||||||
|
|
||||||
|
case kInstMatch:
|
||||||
|
case kInstFail:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (SparseSet::const_iterator i = reachable->begin();
|
||||||
|
i != reachable->end();
|
||||||
|
++i) {
|
||||||
|
int id = *i;
|
||||||
|
if (predmap->has_index(id)) {
|
||||||
|
for (int pred : (*predvec)[predmap->get_existing(id)]) {
|
||||||
|
if (!reachable->contains(pred)) {
|
||||||
|
// id has a predecessor that cannot be reached from root!
|
||||||
|
// Therefore, id must be a "root" too - mark it as such.
|
||||||
|
if (!rootmap->has_index(id))
|
||||||
|
rootmap->set_new(id, rootmap->size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void Prog::EmitList(int root, SparseArray<int>* rootmap,
|
||||||
|
std::vector<Inst>* flat,
|
||||||
|
SparseSet* reachable, std::vector<int>* stk) {
|
||||||
|
reachable->clear();
|
||||||
|
stk->clear();
|
||||||
|
stk->push_back(root);
|
||||||
|
while (!stk->empty()) {
|
||||||
|
int id = stk->back();
|
||||||
|
stk->pop_back();
|
||||||
|
Loop:
|
||||||
|
if (reachable->contains(id))
|
||||||
|
continue;
|
||||||
|
reachable->insert_new(id);
|
||||||
|
|
||||||
|
if (id != root && rootmap->has_index(id)) {
|
||||||
|
// We reached another "tree" via epsilon transition. Emit a kInstNop
|
||||||
|
// instruction so that the Prog does not become quadratically larger.
|
||||||
|
flat->emplace_back();
|
||||||
|
flat->back().set_opcode(kInstNop);
|
||||||
|
flat->back().set_out(rootmap->get_existing(id));
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Inst* ip = inst(id);
|
||||||
|
switch (ip->opcode()) {
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstAltMatch:
|
||||||
|
flat->emplace_back();
|
||||||
|
flat->back().set_opcode(kInstAltMatch);
|
||||||
|
flat->back().set_out(static_cast<int>(flat->size()));
|
||||||
|
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
|
||||||
|
FALLTHROUGH_INTENDED;
|
||||||
|
|
||||||
|
case kInstAlt:
|
||||||
|
stk->push_back(ip->out1());
|
||||||
|
id = ip->out();
|
||||||
|
goto Loop;
|
||||||
|
|
||||||
|
case kInstByteRange:
|
||||||
|
case kInstCapture:
|
||||||
|
case kInstEmptyWidth:
|
||||||
|
flat->emplace_back();
|
||||||
|
memmove(&flat->back(), ip, sizeof *ip);
|
||||||
|
flat->back().set_out(rootmap->get_existing(ip->out()));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kInstNop:
|
||||||
|
id = ip->out();
|
||||||
|
goto Loop;
|
||||||
|
|
||||||
|
case kInstMatch:
|
||||||
|
case kInstFail:
|
||||||
|
flat->emplace_back();
|
||||||
|
memmove(&flat->back(), ip, sizeof *ip);
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
|
@ -2,50 +2,27 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_PROG_H_
|
||||||
|
#define RE2_PROG_H_
|
||||||
|
|
||||||
// Compiled representation of regular expressions.
|
// Compiled representation of regular expressions.
|
||||||
// See regexp.h for the Regexp class, which represents a regular
|
// See regexp.h for the Regexp class, which represents a regular
|
||||||
// expression symbolically.
|
// expression symbolically.
|
||||||
|
|
||||||
#ifndef RE2_PROG_H__
|
#include <stdint.h>
|
||||||
#define RE2_PROG_H__
|
#include <functional>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/sparse_array.h"
|
||||||
|
#include "util/sparse_set.h"
|
||||||
#include "re2/re2.h"
|
#include "re2/re2.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
// Simple fixed-size bitmap.
|
|
||||||
template<int Bits>
|
|
||||||
class Bitmap {
|
|
||||||
public:
|
|
||||||
Bitmap() { Reset(); }
|
|
||||||
int Size() { return Bits; }
|
|
||||||
|
|
||||||
void Reset() {
|
|
||||||
for (int i = 0; i < Words; i++)
|
|
||||||
w_[i] = 0;
|
|
||||||
}
|
|
||||||
bool Get(int k) const {
|
|
||||||
return w_[k >> WordLog] & (1<<(k & 31));
|
|
||||||
}
|
|
||||||
void Set(int k) {
|
|
||||||
w_[k >> WordLog] |= 1<<(k & 31);
|
|
||||||
}
|
|
||||||
void Clear(int k) {
|
|
||||||
w_[k >> WordLog] &= ~(1<<(k & 31));
|
|
||||||
}
|
|
||||||
uint32 Word(int i) const {
|
|
||||||
return w_[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
static const int WordLog = 5;
|
|
||||||
static const int Words = (Bits+31)/32;
|
|
||||||
uint32 w_[Words];
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
// Opcodes for Inst
|
// Opcodes for Inst
|
||||||
enum InstOp {
|
enum InstOp {
|
||||||
kInstAlt = 0, // choose between out_ and out1_
|
kInstAlt = 0, // choose between out_ and out1_
|
||||||
@ -56,6 +33,7 @@ enum InstOp {
|
|||||||
kInstMatch, // found a match!
|
kInstMatch, // found a match!
|
||||||
kInstNop, // no-op; occasionally unavoidable
|
kInstNop, // no-op; occasionally unavoidable
|
||||||
kInstFail, // never match; occasionally unavoidable
|
kInstFail, // never match; occasionally unavoidable
|
||||||
|
kNumInst,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Bit flags for empty-width specials
|
// Bit flags for empty-width specials
|
||||||
@ -69,10 +47,8 @@ enum EmptyOp {
|
|||||||
kEmptyAllFlags = (1<<6)-1,
|
kEmptyAllFlags = (1<<6)-1,
|
||||||
};
|
};
|
||||||
|
|
||||||
class Regexp;
|
|
||||||
|
|
||||||
class DFA;
|
class DFA;
|
||||||
struct OneState;
|
class Regexp;
|
||||||
|
|
||||||
// Compiled form of regexp program.
|
// Compiled form of regexp program.
|
||||||
class Prog {
|
class Prog {
|
||||||
@ -85,19 +61,24 @@ class Prog {
|
|||||||
public:
|
public:
|
||||||
Inst() : out_opcode_(0), out1_(0) {}
|
Inst() : out_opcode_(0), out1_(0) {}
|
||||||
|
|
||||||
|
// Copyable.
|
||||||
|
Inst(const Inst&) = default;
|
||||||
|
Inst& operator=(const Inst&) = default;
|
||||||
|
|
||||||
// Constructors per opcode
|
// Constructors per opcode
|
||||||
void InitAlt(uint32 out, uint32 out1);
|
void InitAlt(uint32_t out, uint32_t out1);
|
||||||
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
|
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
|
||||||
void InitCapture(int cap, uint32 out);
|
void InitCapture(int cap, uint32_t out);
|
||||||
void InitEmptyWidth(EmptyOp empty, uint32 out);
|
void InitEmptyWidth(EmptyOp empty, uint32_t out);
|
||||||
void InitMatch(int id);
|
void InitMatch(int id);
|
||||||
void InitNop(uint32 out);
|
void InitNop(uint32_t out);
|
||||||
void InitFail();
|
void InitFail();
|
||||||
|
|
||||||
// Getters
|
// Getters
|
||||||
int id(Prog* p) { return this - p->inst_; }
|
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
|
||||||
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
||||||
int out() { return out_opcode_>>3; }
|
int last() { return (out_opcode_>>3)&1; }
|
||||||
|
int out() { return out_opcode_>>4; }
|
||||||
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
||||||
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
||||||
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
||||||
@ -105,9 +86,12 @@ class Prog {
|
|||||||
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
||||||
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
||||||
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
||||||
|
|
||||||
bool greedy(Prog* p) {
|
bool greedy(Prog* p) {
|
||||||
DCHECK_EQ(opcode(), kInstAltMatch);
|
DCHECK_EQ(opcode(), kInstAltMatch);
|
||||||
return p->inst(out())->opcode() == kInstByteRange;
|
return p->inst(out())->opcode() == kInstByteRange ||
|
||||||
|
(p->inst(out())->opcode() == kInstNop &&
|
||||||
|
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Does this inst (an kInstByteRange) match c?
|
// Does this inst (an kInstByteRange) match c?
|
||||||
@ -122,41 +106,45 @@ class Prog {
|
|||||||
string Dump();
|
string Dump();
|
||||||
|
|
||||||
// Maximum instruction id.
|
// Maximum instruction id.
|
||||||
// (Must fit in out_opcode_, and PatchList steals another bit.)
|
// (Must fit in out_opcode_. PatchList/last steal another bit.)
|
||||||
static const int kMaxInst = (1<<28) - 1;
|
static const int kMaxInst = (1<<28) - 1;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void set_opcode(InstOp opcode) {
|
void set_opcode(InstOp opcode) {
|
||||||
out_opcode_ = (out()<<3) | opcode;
|
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
|
||||||
|
}
|
||||||
|
|
||||||
|
void set_last() {
|
||||||
|
out_opcode_ = (out()<<4) | (1<<3) | opcode();
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_out(int out) {
|
void set_out(int out) {
|
||||||
out_opcode_ = (out<<3) | opcode();
|
out_opcode_ = (out<<4) | (last()<<3) | opcode();
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_out_opcode(int out, InstOp opcode) {
|
void set_out_opcode(int out, InstOp opcode) {
|
||||||
out_opcode_ = (out<<3) | opcode;
|
out_opcode_ = (out<<4) | (last()<<3) | opcode;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
|
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
|
||||||
union { // additional instruction arguments:
|
union { // additional instruction arguments:
|
||||||
uint32 out1_; // opcode == kInstAlt
|
uint32_t out1_; // opcode == kInstAlt
|
||||||
// alternate next instruction
|
// alternate next instruction
|
||||||
|
|
||||||
int32 cap_; // opcode == kInstCapture
|
int32_t cap_; // opcode == kInstCapture
|
||||||
// Index of capture register (holds text
|
// Index of capture register (holds text
|
||||||
// position recorded by capturing parentheses).
|
// position recorded by capturing parentheses).
|
||||||
// For \n (the submatch for the nth parentheses),
|
// For \n (the submatch for the nth parentheses),
|
||||||
// the left parenthesis captures into register 2*n
|
// the left parenthesis captures into register 2*n
|
||||||
// and the right one captures into register 2*n+1.
|
// and the right one captures into register 2*n+1.
|
||||||
|
|
||||||
int32 match_id_; // opcode == kInstMatch
|
int32_t match_id_; // opcode == kInstMatch
|
||||||
// Match ID to identify this match (for re2::Set).
|
// Match ID to identify this match (for re2::Set).
|
||||||
|
|
||||||
struct { // opcode == kInstByteRange
|
struct { // opcode == kInstByteRange
|
||||||
uint8 lo_; // byte range is lo_-hi_ inclusive
|
uint8_t lo_; // byte range is lo_-hi_ inclusive
|
||||||
uint8 hi_; //
|
uint8_t hi_; //
|
||||||
uint8 foldcase_; // convert A-Z to a-z before checking range.
|
uint8_t foldcase_; // convert A-Z to a-z before checking range.
|
||||||
};
|
};
|
||||||
|
|
||||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
EmptyOp empty_; // opcode == kInstEmptyWidth
|
||||||
@ -166,8 +154,6 @@ class Prog {
|
|||||||
friend class Compiler;
|
friend class Compiler;
|
||||||
friend struct PatchList;
|
friend struct PatchList;
|
||||||
friend class Prog;
|
friend class Prog;
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Inst);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Whether to anchor the search.
|
// Whether to anchor the search.
|
||||||
@ -200,13 +186,13 @@ class Prog {
|
|||||||
int start_unanchored() { return start_unanchored_; }
|
int start_unanchored() { return start_unanchored_; }
|
||||||
void set_start(int start) { start_ = start; }
|
void set_start(int start) { start_ = start; }
|
||||||
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
||||||
int64 size() { return size_; }
|
int size() { return size_; }
|
||||||
bool reversed() { return reversed_; }
|
bool reversed() { return reversed_; }
|
||||||
void set_reversed(bool reversed) { reversed_ = reversed; }
|
void set_reversed(bool reversed) { reversed_ = reversed; }
|
||||||
int64 byte_inst_count() { return byte_inst_count_; }
|
int list_count() { return list_count_; }
|
||||||
const Bitmap<256>& byterange() { return byterange_; }
|
int inst_count(InstOp op) { return inst_count_[op]; }
|
||||||
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
|
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
|
||||||
int64 dfa_mem() { return dfa_mem_; }
|
int64_t dfa_mem() { return dfa_mem_; }
|
||||||
int flags() { return flags_; }
|
int flags() { return flags_; }
|
||||||
void set_flags(int flags) { flags_ = flags; }
|
void set_flags(int flags) { flags_ = flags; }
|
||||||
bool anchor_start() { return anchor_start_; }
|
bool anchor_start() { return anchor_start_; }
|
||||||
@ -214,21 +200,19 @@ class Prog {
|
|||||||
bool anchor_end() { return anchor_end_; }
|
bool anchor_end() { return anchor_end_; }
|
||||||
void set_anchor_end(bool b) { anchor_end_ = b; }
|
void set_anchor_end(bool b) { anchor_end_ = b; }
|
||||||
int bytemap_range() { return bytemap_range_; }
|
int bytemap_range() { return bytemap_range_; }
|
||||||
const uint8* bytemap() { return bytemap_; }
|
const uint8_t* bytemap() { return bytemap_; }
|
||||||
|
|
||||||
|
// Lazily computed.
|
||||||
|
int first_byte();
|
||||||
|
|
||||||
// Returns string representation of program for debugging.
|
// Returns string representation of program for debugging.
|
||||||
string Dump();
|
string Dump();
|
||||||
string DumpUnanchored();
|
string DumpUnanchored();
|
||||||
|
string DumpByteMap();
|
||||||
// Record that at some point in the prog, the bytes in the range
|
|
||||||
// lo-hi (inclusive) are treated as different from bytes outside the range.
|
|
||||||
// Tracking this lets the DFA collapse commonly-treated byte ranges
|
|
||||||
// when recording state pointers, greatly reducing its memory footprint.
|
|
||||||
void MarkByteRange(int lo, int hi);
|
|
||||||
|
|
||||||
// Returns the set of kEmpty flags that are in effect at
|
// Returns the set of kEmpty flags that are in effect at
|
||||||
// position p within context.
|
// position p within context.
|
||||||
static uint32 EmptyFlags(const StringPiece& context, const char* p);
|
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
|
||||||
|
|
||||||
// Returns whether byte c is a word character: ASCII only.
|
// Returns whether byte c is a word character: ASCII only.
|
||||||
// Used by the implementation of \b and \B.
|
// Used by the implementation of \b and \B.
|
||||||
@ -237,7 +221,7 @@ class Prog {
|
|||||||
// (the DFA has only one-byte lookahead).
|
// (the DFA has only one-byte lookahead).
|
||||||
// - even if the lookahead were possible, the Progs would be huge.
|
// - even if the lookahead were possible, the Progs would be huge.
|
||||||
// This crude approximation is the same one PCRE uses.
|
// This crude approximation is the same one PCRE uses.
|
||||||
static bool IsWordChar(uint8 c) {
|
static bool IsWordChar(uint8_t c) {
|
||||||
return ('A' <= c && c <= 'Z') ||
|
return ('A' <= c && c <= 'Z') ||
|
||||||
('a' <= c && c <= 'z') ||
|
('a' <= c && c <= 'z') ||
|
||||||
('0' <= c && c <= '9') ||
|
('0' <= c && c <= '9') ||
|
||||||
@ -270,19 +254,37 @@ class Prog {
|
|||||||
// If matches != NULL and kind == kManyMatch and there is a match,
|
// If matches != NULL and kind == kManyMatch and there is a match,
|
||||||
// SearchDFA fills matches with the match IDs of the final matching state.
|
// SearchDFA fills matches with the match IDs of the final matching state.
|
||||||
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
||||||
Anchor anchor, MatchKind kind,
|
Anchor anchor, MatchKind kind, StringPiece* match0,
|
||||||
StringPiece* match0, bool* failed,
|
bool* failed, SparseSet* matches);
|
||||||
vector<int>* matches);
|
|
||||||
|
|
||||||
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
|
// The callback issued after building each DFA state with BuildEntireDFA().
|
||||||
|
// If next is null, then the memory budget has been exhausted and building
|
||||||
|
// will halt. Otherwise, the state has been built and next points to an array
|
||||||
|
// of bytemap_range()+1 slots holding the next states as per the bytemap and
|
||||||
|
// kByteEndText. The number of the state is implied by the callback sequence:
|
||||||
|
// the first callback is for state 0, the second callback is for state 1, ...
|
||||||
|
// match indicates whether the state is a matching state.
|
||||||
|
using DFAStateCallback = std::function<void(const int* next, bool match)>;
|
||||||
|
|
||||||
|
// Build the entire DFA for the given match kind.
|
||||||
// Usually the DFA is built out incrementally, as needed, which
|
// Usually the DFA is built out incrementally, as needed, which
|
||||||
// avoids lots of unnecessary work. This function is useful only
|
// avoids lots of unnecessary work.
|
||||||
// for testing purposes. Returns number of states.
|
// If cb is not empty, it receives one callback per state built.
|
||||||
int BuildEntireDFA(MatchKind kind);
|
// Returns the number of states built.
|
||||||
|
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
|
||||||
|
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
|
||||||
|
|
||||||
|
// Controls whether the DFA should bail out early if the NFA would be faster.
|
||||||
|
// FOR TESTING ONLY.
|
||||||
|
static void TEST_dfa_should_bail_when_slow(bool b);
|
||||||
|
|
||||||
// Compute bytemap.
|
// Compute bytemap.
|
||||||
void ComputeByteMap();
|
void ComputeByteMap();
|
||||||
|
|
||||||
|
// Computes whether all matches must begin with the same first
|
||||||
|
// byte, and if so, returns that byte. If not, returns -1.
|
||||||
|
int ComputeFirstByte();
|
||||||
|
|
||||||
// Run peep-hole optimizer on program.
|
// Run peep-hole optimizer on program.
|
||||||
void Optimize();
|
void Optimize();
|
||||||
|
|
||||||
@ -329,48 +331,80 @@ class Prog {
|
|||||||
// Returns true on success, false on error.
|
// Returns true on success, false on error.
|
||||||
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
||||||
|
|
||||||
|
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
||||||
|
// Outputs the program fanout into the given sparse array.
|
||||||
|
void Fanout(SparseArray<int>* fanout);
|
||||||
|
|
||||||
// Compiles a collection of regexps to Prog. Each regexp will have
|
// Compiles a collection of regexps to Prog. Each regexp will have
|
||||||
// its own Match instruction recording the index in the vector.
|
// its own Match instruction recording the index in the output vector.
|
||||||
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
|
||||||
Regexp* re);
|
|
||||||
|
// Flattens the Prog from "tree" form to "list" form. This is an in-place
|
||||||
|
// operation in the sense that the old instructions are lost.
|
||||||
|
void Flatten();
|
||||||
|
|
||||||
|
// Walks the Prog; the "successor roots" or predecessors of the reachable
|
||||||
|
// instructions are marked in rootmap or predmap/predvec, respectively.
|
||||||
|
// reachable and stk are preallocated scratch structures.
|
||||||
|
void MarkSuccessors(SparseArray<int>* rootmap,
|
||||||
|
SparseArray<int>* predmap,
|
||||||
|
std::vector<std::vector<int>>* predvec,
|
||||||
|
SparseSet* reachable, std::vector<int>* stk);
|
||||||
|
|
||||||
|
// Walks the Prog from the given "root" instruction; the "dominator root"
|
||||||
|
// of the reachable instructions (if such exists) is marked in rootmap.
|
||||||
|
// reachable and stk are preallocated scratch structures.
|
||||||
|
void MarkDominator(int root, SparseArray<int>* rootmap,
|
||||||
|
SparseArray<int>* predmap,
|
||||||
|
std::vector<std::vector<int>>* predvec,
|
||||||
|
SparseSet* reachable, std::vector<int>* stk);
|
||||||
|
|
||||||
|
// Walks the Prog from the given "root" instruction; the reachable
|
||||||
|
// instructions are emitted in "list" form and appended to flat.
|
||||||
|
// reachable and stk are preallocated scratch structures.
|
||||||
|
void EmitList(int root, SparseArray<int>* rootmap,
|
||||||
|
std::vector<Inst>* flat,
|
||||||
|
SparseSet* reachable, std::vector<int>* stk);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
friend class Compiler;
|
friend class Compiler;
|
||||||
|
|
||||||
DFA* GetDFA(MatchKind kind);
|
DFA* GetDFA(MatchKind kind);
|
||||||
|
void DeleteDFA(DFA* dfa);
|
||||||
|
|
||||||
bool anchor_start_; // regexp has explicit start anchor
|
bool anchor_start_; // regexp has explicit start anchor
|
||||||
bool anchor_end_; // regexp has explicit end anchor
|
bool anchor_end_; // regexp has explicit end anchor
|
||||||
bool reversed_; // whether program runs backward over input
|
bool reversed_; // whether program runs backward over input
|
||||||
|
bool did_flatten_; // has Flatten been called?
|
||||||
bool did_onepass_; // has IsOnePass been called?
|
bool did_onepass_; // has IsOnePass been called?
|
||||||
|
|
||||||
int start_; // entry point for program
|
int start_; // entry point for program
|
||||||
int start_unanchored_; // unanchored entry point for program
|
int start_unanchored_; // unanchored entry point for program
|
||||||
int size_; // number of instructions
|
int size_; // number of instructions
|
||||||
int byte_inst_count_; // number of kInstByteRange instructions
|
|
||||||
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
||||||
|
int first_byte_; // required first byte for match, or -1 if none
|
||||||
int flags_; // regexp parse flags
|
int flags_; // regexp parse flags
|
||||||
int onepass_statesize_; // byte size of each OneState* node
|
|
||||||
|
int list_count_; // count of lists (see above)
|
||||||
|
int inst_count_[kNumInst]; // count of instructions by opcode
|
||||||
|
|
||||||
Inst* inst_; // pointer to instruction array
|
Inst* inst_; // pointer to instruction array
|
||||||
|
uint8_t* onepass_nodes_; // data for OnePass nodes
|
||||||
|
|
||||||
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
|
int64_t dfa_mem_; // Maximum memory for DFAs.
|
||||||
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
|
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
|
||||||
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
|
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
|
||||||
int64 dfa_mem_; // Maximum memory for DFAs.
|
|
||||||
void (*delete_dfa_)(DFA* dfa);
|
|
||||||
|
|
||||||
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
|
uint8_t bytemap_[256]; // map from input bytes to byte classes
|
||||||
// commonly-treated byte range.
|
|
||||||
uint8 bytemap_[256]; // map from input bytes to byte classes
|
|
||||||
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
|
|
||||||
|
|
||||||
uint8* onepass_nodes_; // data for OnePass nodes
|
std::once_flag first_byte_once_;
|
||||||
OneState* onepass_start_; // start node for OnePass program
|
std::once_flag dfa_first_once_;
|
||||||
|
std::once_flag dfa_longest_once_;
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Prog);
|
Prog(const Prog&) = delete;
|
||||||
|
Prog& operator=(const Prog&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_PROG_H__
|
#endif // RE2_PROG_H_
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -2,8 +2,8 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#ifndef RE2_RE2_H
|
#ifndef RE2_RE2_H_
|
||||||
#define RE2_RE2_H
|
#define RE2_RE2_H_
|
||||||
|
|
||||||
// C++ interface to the re2 regular-expression library.
|
// C++ interface to the re2 regular-expression library.
|
||||||
// RE2 supports Perl-style regular expressions (with extensions like
|
// RE2 supports Perl-style regular expressions (with extensions like
|
||||||
@ -17,7 +17,7 @@
|
|||||||
// some of the more complicated things thrown away. In particular,
|
// some of the more complicated things thrown away. In particular,
|
||||||
// backreferences and generalized assertions are not available, nor is \Z.
|
// backreferences and generalized assertions are not available, nor is \Z.
|
||||||
//
|
//
|
||||||
// See http://code.google.com/p/re2/wiki/Syntax for the syntax
|
// See https://github.com/google/re2/wiki/Syntax for the syntax
|
||||||
// supported by RE2, and a comparison with PCRE and PERL regexps.
|
// supported by RE2, and a comparison with PCRE and PERL regexps.
|
||||||
//
|
//
|
||||||
// For those not familiar with Perl's regular expressions,
|
// For those not familiar with Perl's regular expressions,
|
||||||
@ -179,38 +179,24 @@
|
|||||||
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
|
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
|
||||||
// will leave 64 in a, b, c, and d.
|
// will leave 64 in a, b, c, and d.
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <algorithm>
|
||||||
#include <map>
|
#include <map>
|
||||||
|
#include <mutex>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
#include "re2/variadic_function.h"
|
|
||||||
|
|
||||||
#ifndef RE2_HAVE_LONGLONG
|
#include "re2/stringpiece.h"
|
||||||
#define RE2_HAVE_LONGLONG 1
|
|
||||||
#endif
|
namespace re2 {
|
||||||
|
class Prog;
|
||||||
|
class Regexp;
|
||||||
|
} // namespace re2
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
|
// TODO(junyer): Get rid of this.
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::map;
|
|
||||||
class Mutex;
|
|
||||||
class Prog;
|
|
||||||
class Regexp;
|
|
||||||
|
|
||||||
// The following enum should be used only as a constructor argument to indicate
|
|
||||||
// that the variable has static storage class, and that the constructor should
|
|
||||||
// do nothing to its state. It indicates to the reader that it is legal to
|
|
||||||
// declare a static instance of the class, provided the constructor is given
|
|
||||||
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
|
|
||||||
// static variable that has a constructor or a destructor because invocation
|
|
||||||
// order is undefined. However, IF the type can be initialized by filling with
|
|
||||||
// zeroes (which the loader does for static variables), AND the type's
|
|
||||||
// destructor does nothing to the storage, then a constructor for static
|
|
||||||
// initialization can be declared as
|
|
||||||
// explicit MyClass(LinkerInitialized x) {}
|
|
||||||
// and invoked as
|
|
||||||
// static MyClass my_variable_name(LINKER_INITIALIZED);
|
|
||||||
enum LinkerInitialized { LINKER_INITIALIZED };
|
|
||||||
|
|
||||||
// Interface for regular expression matching. Also corresponds to a
|
// Interface for regular expression matching. Also corresponds to a
|
||||||
// pre-compiled regular expression. An "RE2" object is safe for
|
// pre-compiled regular expression. An "RE2" object is safe for
|
||||||
@ -266,7 +252,7 @@ class RE2 {
|
|||||||
RE2(const string& pattern);
|
RE2(const string& pattern);
|
||||||
#endif
|
#endif
|
||||||
RE2(const StringPiece& pattern);
|
RE2(const StringPiece& pattern);
|
||||||
RE2(const StringPiece& pattern, const Options& option);
|
RE2(const StringPiece& pattern, const Options& options);
|
||||||
~RE2();
|
~RE2();
|
||||||
|
|
||||||
// Returns whether RE2 was created properly.
|
// Returns whether RE2 was created properly.
|
||||||
@ -293,6 +279,11 @@ class RE2 {
|
|||||||
// Larger numbers are more expensive than smaller numbers.
|
// Larger numbers are more expensive than smaller numbers.
|
||||||
int ProgramSize() const;
|
int ProgramSize() const;
|
||||||
|
|
||||||
|
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
||||||
|
// Outputs the program fanout as a histogram bucketed by powers of 2.
|
||||||
|
// Returns the number of the largest non-empty bucket.
|
||||||
|
int ProgramFanout(std::map<int, int>* histogram) const;
|
||||||
|
|
||||||
// Returns the underlying Regexp; not for general use.
|
// Returns the underlying Regexp; not for general use.
|
||||||
// Returns entire_regexp_ so that callers don't need
|
// Returns entire_regexp_ so that callers don't need
|
||||||
// to know about prefix_ and prefix_foldcase_.
|
// to know about prefix_ and prefix_foldcase_.
|
||||||
@ -300,21 +291,21 @@ class RE2 {
|
|||||||
|
|
||||||
/***** The useful part: the matching interface *****/
|
/***** The useful part: the matching interface *****/
|
||||||
|
|
||||||
// Matches "text" against "pattern". If pointer arguments are
|
// Matches "text" against "re". If pointer arguments are
|
||||||
// supplied, copies matched sub-patterns into them.
|
// supplied, copies matched sub-patterns into them.
|
||||||
//
|
//
|
||||||
// You can pass in a "const char*" or a "string" for "text".
|
// You can pass in a "const char*" or a "string" for "text".
|
||||||
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
|
// You can pass in a "const char*" or a "string" or a "RE2" for "re".
|
||||||
//
|
//
|
||||||
// The provided pointer arguments can be pointers to any scalar numeric
|
// The provided pointer arguments can be pointers to any scalar numeric
|
||||||
// type, or one of:
|
// type, or one of:
|
||||||
// string (matched piece is copied to string)
|
// string (matched piece is copied to string)
|
||||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
// StringPiece (StringPiece is mutated to point to matched piece)
|
||||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
|
||||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
||||||
//
|
//
|
||||||
// Returns true iff all of the following conditions are satisfied:
|
// Returns true iff all of the following conditions are satisfied:
|
||||||
// a. "text" matches "pattern" exactly
|
// a. "text" matches "re" exactly
|
||||||
// b. The number of matched sub-patterns is >= number of supplied pointers
|
// b. The number of matched sub-patterns is >= number of supplied pointers
|
||||||
// c. The "i"th argument has a suitable type for holding the
|
// c. The "i"th argument has a suitable type for holding the
|
||||||
// string captured as the "i"th sub-pattern. If you pass in
|
// string captured as the "i"th sub-pattern. If you pass in
|
||||||
@ -330,32 +321,65 @@ class RE2 {
|
|||||||
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
||||||
static bool FullMatchN(const StringPiece& text, const RE2& re,
|
static bool FullMatchN(const StringPiece& text, const RE2& re,
|
||||||
const Arg* const args[], int argc);
|
const Arg* const args[], int argc);
|
||||||
static const VariadicFunction2<
|
|
||||||
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
|
|
||||||
|
|
||||||
// Exactly like FullMatch(), except that "pattern" is allowed to match
|
// Exactly like FullMatch(), except that "re" is allowed to match
|
||||||
// a substring of "text".
|
// a substring of "text".
|
||||||
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
|
static bool PartialMatchN(const StringPiece& text, const RE2& re,
|
||||||
const Arg* const args[], int argc);
|
const Arg* const args[], int argc);
|
||||||
static const VariadicFunction2<
|
|
||||||
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
|
|
||||||
|
|
||||||
// Like FullMatch() and PartialMatch(), except that pattern has to
|
// Like FullMatch() and PartialMatch(), except that "re" has to match
|
||||||
// match a prefix of "text", and "input" is advanced past the matched
|
// a prefix of the text, and "input" is advanced past the matched
|
||||||
// text. Note: "input" is modified iff this routine returns true.
|
// text. Note: "input" is modified iff this routine returns true.
|
||||||
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
|
static bool ConsumeN(StringPiece* input, const RE2& re,
|
||||||
const Arg* const args[], int argc);
|
const Arg* const args[], int argc);
|
||||||
static const VariadicFunction2<
|
|
||||||
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
|
|
||||||
|
|
||||||
// Like Consume(..), but does not anchor the match at the beginning of the
|
// Like Consume(), but does not anchor the match at the beginning of
|
||||||
// string. That is, "pattern" need not start its match at the beginning of
|
// the text. That is, "re" need not start its match at the beginning
|
||||||
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
|
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
|
||||||
// word in "s" and stores it in "word".
|
// the next word in "s" and stores it in "word".
|
||||||
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
|
static bool FindAndConsumeN(StringPiece* input, const RE2& re,
|
||||||
const Arg* const args[], int argc);
|
const Arg* const args[], int argc);
|
||||||
static const VariadicFunction2<
|
|
||||||
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
|
#ifndef SWIG
|
||||||
|
private:
|
||||||
|
template <typename F, typename SP>
|
||||||
|
static inline bool Apply(F f, SP sp, const RE2& re) {
|
||||||
|
return f(sp, re, NULL, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename F, typename SP, typename... A>
|
||||||
|
static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
|
||||||
|
const Arg* const args[] = {&a...};
|
||||||
|
const int argc = sizeof...(a);
|
||||||
|
return f(sp, re, args, argc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public:
|
||||||
|
// In order to allow FullMatch() et al. to be called with a varying number
|
||||||
|
// of arguments of varying types, we use two layers of variadic templates.
|
||||||
|
// The first layer constructs the temporary Arg objects. The second layer
|
||||||
|
// (above) constructs the array of pointers to the temporary Arg objects.
|
||||||
|
|
||||||
|
template <typename... A>
|
||||||
|
static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
|
||||||
|
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename... A>
|
||||||
|
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
|
||||||
|
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename... A>
|
||||||
|
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
|
||||||
|
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename... A>
|
||||||
|
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
|
||||||
|
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Replace the first match of "pattern" in "str" with "rewrite".
|
// Replace the first match of "pattern" in "str" with "rewrite".
|
||||||
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
||||||
@ -397,6 +421,8 @@ class RE2 {
|
|||||||
//
|
//
|
||||||
// Returns true iff a match occurred and the extraction happened
|
// Returns true iff a match occurred and the extraction happened
|
||||||
// successfully; if no match occurs, the string is left unaffected.
|
// successfully; if no match occurs, the string is left unaffected.
|
||||||
|
//
|
||||||
|
// REQUIRES: "text" must not alias any part of "*out".
|
||||||
static bool Extract(const StringPiece &text,
|
static bool Extract(const StringPiece &text,
|
||||||
const RE2& pattern,
|
const RE2& pattern,
|
||||||
const StringPiece &rewrite,
|
const StringPiece &rewrite,
|
||||||
@ -440,17 +466,16 @@ class RE2 {
|
|||||||
// does not count: if the regexp is "(a)(b)", returns 2.
|
// does not count: if the regexp is "(a)(b)", returns 2.
|
||||||
int NumberOfCapturingGroups() const;
|
int NumberOfCapturingGroups() const;
|
||||||
|
|
||||||
|
|
||||||
// Return a map from names to capturing indices.
|
// Return a map from names to capturing indices.
|
||||||
// The map records the index of the leftmost group
|
// The map records the index of the leftmost group
|
||||||
// with the given name.
|
// with the given name.
|
||||||
// Only valid until the re is deleted.
|
// Only valid until the re is deleted.
|
||||||
const map<string, int>& NamedCapturingGroups() const;
|
const std::map<string, int>& NamedCapturingGroups() const;
|
||||||
|
|
||||||
// Return a map from capturing indices to names.
|
// Return a map from capturing indices to names.
|
||||||
// The map has no entries for unnamed groups.
|
// The map has no entries for unnamed groups.
|
||||||
// Only valid until the re is deleted.
|
// Only valid until the re is deleted.
|
||||||
const map<int, string>& CapturingGroupNames() const;
|
const std::map<int, string>& CapturingGroupNames() const;
|
||||||
|
|
||||||
// General matching routine.
|
// General matching routine.
|
||||||
// Match against text starting at offset startpos
|
// Match against text starting at offset startpos
|
||||||
@ -459,8 +484,8 @@ class RE2 {
|
|||||||
// On a successful match, fills in match[] (up to nmatch entries)
|
// On a successful match, fills in match[] (up to nmatch entries)
|
||||||
// with information about submatches.
|
// with information about submatches.
|
||||||
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
|
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
|
||||||
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
|
// setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar",
|
||||||
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
|
// match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL.
|
||||||
//
|
//
|
||||||
// Don't ask for more match information than you will use:
|
// Don't ask for more match information than you will use:
|
||||||
// runs much faster with nmatch == 1 than nmatch > 1, and
|
// runs much faster with nmatch == 1 than nmatch > 1, and
|
||||||
@ -471,10 +496,10 @@ class RE2 {
|
|||||||
// Passing text == StringPiece(NULL, 0) will be handled like any other
|
// Passing text == StringPiece(NULL, 0) will be handled like any other
|
||||||
// empty string, but note that on return, it will not be possible to tell
|
// empty string, but note that on return, it will not be possible to tell
|
||||||
// whether submatch i matched the empty string or did not match:
|
// whether submatch i matched the empty string or did not match:
|
||||||
// either way, match[i] == NULL.
|
// either way, match[i].data() == NULL.
|
||||||
bool Match(const StringPiece& text,
|
bool Match(const StringPiece& text,
|
||||||
int startpos,
|
size_t startpos,
|
||||||
int endpos,
|
size_t endpos,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
StringPiece *match,
|
StringPiece *match,
|
||||||
int nmatch) const;
|
int nmatch) const;
|
||||||
@ -632,19 +657,7 @@ class RE2 {
|
|||||||
void set_one_line(bool b) { one_line_ = b; }
|
void set_one_line(bool b) { one_line_ = b; }
|
||||||
|
|
||||||
void Copy(const Options& src) {
|
void Copy(const Options& src) {
|
||||||
encoding_ = src.encoding_;
|
*this = src;
|
||||||
posix_syntax_ = src.posix_syntax_;
|
|
||||||
longest_match_ = src.longest_match_;
|
|
||||||
log_errors_ = src.log_errors_;
|
|
||||||
max_mem_ = src.max_mem_;
|
|
||||||
literal_ = src.literal_;
|
|
||||||
never_nl_ = src.never_nl_;
|
|
||||||
dot_nl_ = src.dot_nl_;
|
|
||||||
never_capture_ = src.never_capture_;
|
|
||||||
case_sensitive_ = src.case_sensitive_;
|
|
||||||
perl_classes_ = src.perl_classes_;
|
|
||||||
word_boundary_ = src.word_boundary_;
|
|
||||||
one_line_ = src.one_line_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int ParseFlags() const;
|
int ParseFlags() const;
|
||||||
@ -663,10 +676,6 @@ class RE2 {
|
|||||||
bool perl_classes_;
|
bool perl_classes_;
|
||||||
bool word_boundary_;
|
bool word_boundary_;
|
||||||
bool one_line_;
|
bool one_line_;
|
||||||
|
|
||||||
//DISALLOW_EVIL_CONSTRUCTORS(Options);
|
|
||||||
Options(const Options&);
|
|
||||||
void operator=(const Options&);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Returns the options set in the constructor.
|
// Returns the options set in the constructor.
|
||||||
@ -679,10 +688,8 @@ class RE2 {
|
|||||||
static inline Arg CRadix(unsigned int* x);
|
static inline Arg CRadix(unsigned int* x);
|
||||||
static inline Arg CRadix(long* x);
|
static inline Arg CRadix(long* x);
|
||||||
static inline Arg CRadix(unsigned long* x);
|
static inline Arg CRadix(unsigned long* x);
|
||||||
#ifdef RE2_HAVE_LONGLONG
|
|
||||||
static inline Arg CRadix(long long* x);
|
static inline Arg CRadix(long long* x);
|
||||||
static inline Arg CRadix(unsigned long long* x);
|
static inline Arg CRadix(unsigned long long* x);
|
||||||
#endif
|
|
||||||
|
|
||||||
static inline Arg Hex(short* x);
|
static inline Arg Hex(short* x);
|
||||||
static inline Arg Hex(unsigned short* x);
|
static inline Arg Hex(unsigned short* x);
|
||||||
@ -690,10 +697,8 @@ class RE2 {
|
|||||||
static inline Arg Hex(unsigned int* x);
|
static inline Arg Hex(unsigned int* x);
|
||||||
static inline Arg Hex(long* x);
|
static inline Arg Hex(long* x);
|
||||||
static inline Arg Hex(unsigned long* x);
|
static inline Arg Hex(unsigned long* x);
|
||||||
#ifdef RE2_HAVE_LONGLONG
|
|
||||||
static inline Arg Hex(long long* x);
|
static inline Arg Hex(long long* x);
|
||||||
static inline Arg Hex(unsigned long long* x);
|
static inline Arg Hex(unsigned long long* x);
|
||||||
#endif
|
|
||||||
|
|
||||||
static inline Arg Octal(short* x);
|
static inline Arg Octal(short* x);
|
||||||
static inline Arg Octal(unsigned short* x);
|
static inline Arg Octal(unsigned short* x);
|
||||||
@ -701,23 +706,20 @@ class RE2 {
|
|||||||
static inline Arg Octal(unsigned int* x);
|
static inline Arg Octal(unsigned int* x);
|
||||||
static inline Arg Octal(long* x);
|
static inline Arg Octal(long* x);
|
||||||
static inline Arg Octal(unsigned long* x);
|
static inline Arg Octal(unsigned long* x);
|
||||||
#ifdef RE2_HAVE_LONGLONG
|
|
||||||
static inline Arg Octal(long long* x);
|
static inline Arg Octal(long long* x);
|
||||||
static inline Arg Octal(unsigned long long* x);
|
static inline Arg Octal(unsigned long long* x);
|
||||||
#endif
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void Init(const StringPiece& pattern, const Options& options);
|
void Init(const StringPiece& pattern, const Options& options);
|
||||||
|
|
||||||
bool DoMatch(const StringPiece& text,
|
bool DoMatch(const StringPiece& text,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
int* consumed,
|
size_t* consumed,
|
||||||
const Arg* const args[],
|
const Arg* const args[],
|
||||||
int n) const;
|
int n) const;
|
||||||
|
|
||||||
re2::Prog* ReverseProg() const;
|
re2::Prog* ReverseProg() const;
|
||||||
|
|
||||||
mutable Mutex* mutex_;
|
|
||||||
string pattern_; // string regular expression
|
string pattern_; // string regular expression
|
||||||
Options options_; // option flags
|
Options options_; // option flags
|
||||||
string prefix_; // required prefix (before regexp_)
|
string prefix_; // required prefix (before regexp_)
|
||||||
@ -725,8 +727,9 @@ class RE2 {
|
|||||||
re2::Regexp* entire_regexp_; // parsed regular expression
|
re2::Regexp* entire_regexp_; // parsed regular expression
|
||||||
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
|
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
|
||||||
re2::Prog* prog_; // compiled program for regexp
|
re2::Prog* prog_; // compiled program for regexp
|
||||||
mutable re2::Prog* rprog_; // reverse program for regexp
|
|
||||||
bool is_one_pass_; // can use prog_->SearchOnePass?
|
bool is_one_pass_; // can use prog_->SearchOnePass?
|
||||||
|
|
||||||
|
mutable re2::Prog* rprog_; // reverse program for regexp
|
||||||
mutable const string* error_; // Error indicator
|
mutable const string* error_; // Error indicator
|
||||||
// (or points to empty string)
|
// (or points to empty string)
|
||||||
mutable ErrorCode error_code_; // Error code
|
mutable ErrorCode error_code_; // Error code
|
||||||
@ -734,14 +737,19 @@ class RE2 {
|
|||||||
mutable int num_captures_; // Number of capturing groups
|
mutable int num_captures_; // Number of capturing groups
|
||||||
|
|
||||||
// Map from capture names to indices
|
// Map from capture names to indices
|
||||||
mutable const map<string, int>* named_groups_;
|
mutable const std::map<string, int>* named_groups_;
|
||||||
|
|
||||||
// Map from capture indices to names
|
// Map from capture indices to names
|
||||||
mutable const map<int, string>* group_names_;
|
mutable const std::map<int, string>* group_names_;
|
||||||
|
|
||||||
//DISALLOW_EVIL_CONSTRUCTORS(RE2);
|
// Onces for lazy computations.
|
||||||
RE2(const RE2&);
|
mutable std::once_flag rprog_once_;
|
||||||
void operator=(const RE2&);
|
mutable std::once_flag num_captures_once_;
|
||||||
|
mutable std::once_flag named_groups_once_;
|
||||||
|
mutable std::once_flag group_names_once_;
|
||||||
|
|
||||||
|
RE2(const RE2&) = delete;
|
||||||
|
RE2& operator=(const RE2&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
/***** Implementation details *****/
|
/***** Implementation details *****/
|
||||||
@ -752,7 +760,7 @@ class RE2 {
|
|||||||
template <class T>
|
template <class T>
|
||||||
class _RE2_MatchObject {
|
class _RE2_MatchObject {
|
||||||
public:
|
public:
|
||||||
static inline bool Parse(const char* str, int n, void* dest) {
|
static inline bool Parse(const char* str, size_t n, void* dest) {
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
T* object = reinterpret_cast<T*>(dest);
|
T* object = reinterpret_cast<T*>(dest);
|
||||||
return object->ParseFrom(str, n);
|
return object->ParseFrom(str, n);
|
||||||
@ -767,65 +775,64 @@ class RE2::Arg {
|
|||||||
// Constructor specially designed for NULL arguments
|
// Constructor specially designed for NULL arguments
|
||||||
Arg(void*);
|
Arg(void*);
|
||||||
|
|
||||||
typedef bool (*Parser)(const char* str, int n, void* dest);
|
typedef bool (*Parser)(const char* str, size_t n, void* dest);
|
||||||
|
|
||||||
// Type-specific parsers
|
// Type-specific parsers
|
||||||
#define MAKE_PARSER(type, name) \
|
#define MAKE_PARSER(type, name) \
|
||||||
Arg(type* p) : arg_(p), parser_(name) {} \
|
Arg(type* p) : arg_(p), parser_(name) {} \
|
||||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
|
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
|
||||||
|
|
||||||
|
|
||||||
MAKE_PARSER(char, parse_char);
|
MAKE_PARSER(char, parse_char);
|
||||||
MAKE_PARSER(signed char, parse_char);
|
MAKE_PARSER(signed char, parse_schar);
|
||||||
MAKE_PARSER(unsigned char, parse_uchar);
|
MAKE_PARSER(unsigned char, parse_uchar);
|
||||||
|
MAKE_PARSER(float, parse_float);
|
||||||
|
MAKE_PARSER(double, parse_double);
|
||||||
|
MAKE_PARSER(string, parse_string);
|
||||||
|
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||||
|
|
||||||
MAKE_PARSER(short, parse_short);
|
MAKE_PARSER(short, parse_short);
|
||||||
MAKE_PARSER(unsigned short, parse_ushort);
|
MAKE_PARSER(unsigned short, parse_ushort);
|
||||||
MAKE_PARSER(int, parse_int);
|
MAKE_PARSER(int, parse_int);
|
||||||
MAKE_PARSER(unsigned int, parse_uint);
|
MAKE_PARSER(unsigned int, parse_uint);
|
||||||
MAKE_PARSER(long, parse_long);
|
MAKE_PARSER(long, parse_long);
|
||||||
MAKE_PARSER(unsigned long, parse_ulong);
|
MAKE_PARSER(unsigned long, parse_ulong);
|
||||||
#ifdef RE2_HAVE_LONGLONG
|
|
||||||
MAKE_PARSER(long long, parse_longlong);
|
MAKE_PARSER(long long, parse_longlong);
|
||||||
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
||||||
#endif
|
|
||||||
MAKE_PARSER(float, parse_float);
|
|
||||||
MAKE_PARSER(double, parse_double);
|
|
||||||
MAKE_PARSER(string, parse_string);
|
|
||||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
|
||||||
|
|
||||||
#undef MAKE_PARSER
|
#undef MAKE_PARSER
|
||||||
|
|
||||||
// Generic constructor
|
// Generic constructor templates
|
||||||
template <class T> Arg(T*, Parser parser);
|
|
||||||
// Generic constructor template
|
|
||||||
template <class T> Arg(T* p)
|
template <class T> Arg(T* p)
|
||||||
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) {
|
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
|
||||||
}
|
template <class T> Arg(T* p, Parser parser)
|
||||||
|
: arg_(p), parser_(parser) { }
|
||||||
|
|
||||||
// Parse the data
|
// Parse the data
|
||||||
bool Parse(const char* str, int n) const;
|
bool Parse(const char* str, size_t n) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void* arg_;
|
void* arg_;
|
||||||
Parser parser_;
|
Parser parser_;
|
||||||
|
|
||||||
static bool parse_null (const char* str, int n, void* dest);
|
static bool parse_null (const char* str, size_t n, void* dest);
|
||||||
static bool parse_char (const char* str, int n, void* dest);
|
static bool parse_char (const char* str, size_t n, void* dest);
|
||||||
static bool parse_uchar (const char* str, int n, void* dest);
|
static bool parse_schar (const char* str, size_t n, void* dest);
|
||||||
static bool parse_float (const char* str, int n, void* dest);
|
static bool parse_uchar (const char* str, size_t n, void* dest);
|
||||||
static bool parse_double (const char* str, int n, void* dest);
|
static bool parse_float (const char* str, size_t n, void* dest);
|
||||||
static bool parse_string (const char* str, int n, void* dest);
|
static bool parse_double (const char* str, size_t n, void* dest);
|
||||||
static bool parse_stringpiece (const char* str, int n, void* dest);
|
static bool parse_string (const char* str, size_t n, void* dest);
|
||||||
|
static bool parse_stringpiece (const char* str, size_t n, void* dest);
|
||||||
|
|
||||||
#define DECLARE_INTEGER_PARSER(name) \
|
#define DECLARE_INTEGER_PARSER(name) \
|
||||||
private: \
|
private: \
|
||||||
static bool parse_ ## name(const char* str, int n, void* dest); \
|
static bool parse_##name(const char* str, size_t n, void* dest); \
|
||||||
static bool parse_ ## name ## _radix( \
|
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
|
||||||
const char* str, int n, void* dest, int radix); \
|
int radix); \
|
||||||
|
\
|
||||||
public: \
|
public: \
|
||||||
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
|
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
|
||||||
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
|
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
|
||||||
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
|
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
|
||||||
|
|
||||||
DECLARE_INTEGER_PARSER(short);
|
DECLARE_INTEGER_PARSER(short);
|
||||||
DECLARE_INTEGER_PARSER(ushort);
|
DECLARE_INTEGER_PARSER(ushort);
|
||||||
@ -833,29 +840,31 @@ class RE2::Arg {
|
|||||||
DECLARE_INTEGER_PARSER(uint);
|
DECLARE_INTEGER_PARSER(uint);
|
||||||
DECLARE_INTEGER_PARSER(long);
|
DECLARE_INTEGER_PARSER(long);
|
||||||
DECLARE_INTEGER_PARSER(ulong);
|
DECLARE_INTEGER_PARSER(ulong);
|
||||||
#ifdef RE2_HAVE_LONGLONG
|
|
||||||
DECLARE_INTEGER_PARSER(longlong);
|
DECLARE_INTEGER_PARSER(longlong);
|
||||||
DECLARE_INTEGER_PARSER(ulonglong);
|
DECLARE_INTEGER_PARSER(ulonglong);
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef DECLARE_INTEGER_PARSER
|
#undef DECLARE_INTEGER_PARSER
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
||||||
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
||||||
|
|
||||||
inline bool RE2::Arg::Parse(const char* str, int n) const {
|
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
|
||||||
return (*parser_)(str, n, arg_);
|
return (*parser_)(str, n, arg_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This part of the parser, appropriate only for ints, deals with bases
|
// This part of the parser, appropriate only for ints, deals with bases
|
||||||
#define MAKE_INTEGER_PARSER(type, name) \
|
#define MAKE_INTEGER_PARSER(type, name) \
|
||||||
inline RE2::Arg RE2::Hex(type* ptr) { \
|
inline RE2::Arg RE2::Hex(type* ptr) { \
|
||||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
|
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
|
||||||
|
} \
|
||||||
inline RE2::Arg RE2::Octal(type* ptr) { \
|
inline RE2::Arg RE2::Octal(type* ptr) { \
|
||||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
|
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
|
||||||
|
} \
|
||||||
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
||||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
|
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
|
||||||
|
}
|
||||||
|
|
||||||
MAKE_INTEGER_PARSER(short, short)
|
MAKE_INTEGER_PARSER(short, short)
|
||||||
MAKE_INTEGER_PARSER(unsigned short, ushort)
|
MAKE_INTEGER_PARSER(unsigned short, ushort)
|
||||||
@ -863,15 +872,70 @@ MAKE_INTEGER_PARSER(int, int)
|
|||||||
MAKE_INTEGER_PARSER(unsigned int, uint)
|
MAKE_INTEGER_PARSER(unsigned int, uint)
|
||||||
MAKE_INTEGER_PARSER(long, long)
|
MAKE_INTEGER_PARSER(long, long)
|
||||||
MAKE_INTEGER_PARSER(unsigned long, ulong)
|
MAKE_INTEGER_PARSER(unsigned long, ulong)
|
||||||
#ifdef RE2_HAVE_LONGLONG
|
|
||||||
MAKE_INTEGER_PARSER(long long, longlong)
|
MAKE_INTEGER_PARSER(long long, longlong)
|
||||||
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
|
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
|
||||||
#endif
|
|
||||||
|
|
||||||
#undef MAKE_INTEGER_PARSER
|
#undef MAKE_INTEGER_PARSER
|
||||||
|
|
||||||
|
#ifndef SWIG
|
||||||
|
|
||||||
|
// Silence warnings about missing initializers for members of LazyRE2.
|
||||||
|
// Note that we test for Clang first because it defines __GNUC__ as well.
|
||||||
|
#if defined(__clang__)
|
||||||
|
#elif defined(__GNUC__) && __GNUC__ >= 6
|
||||||
|
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Helper for writing global or static RE2s safely.
|
||||||
|
// Write
|
||||||
|
// static LazyRE2 re = {".*"};
|
||||||
|
// and then use *re instead of writing
|
||||||
|
// static RE2 re(".*");
|
||||||
|
// The former is more careful about multithreaded
|
||||||
|
// situations than the latter.
|
||||||
|
//
|
||||||
|
// N.B. This class never deletes the RE2 object that
|
||||||
|
// it constructs: that's a feature, so that it can be used
|
||||||
|
// for global and function static variables.
|
||||||
|
class LazyRE2 {
|
||||||
|
private:
|
||||||
|
struct NoArg {};
|
||||||
|
|
||||||
|
public:
|
||||||
|
typedef RE2 element_type; // support std::pointer_traits
|
||||||
|
|
||||||
|
// Constructor omitted to preserve braced initialization in C++98.
|
||||||
|
|
||||||
|
// Pretend to be a pointer to Type (never NULL due to on-demand creation):
|
||||||
|
RE2& operator*() const { return *get(); }
|
||||||
|
RE2* operator->() const { return get(); }
|
||||||
|
|
||||||
|
// Named accessor/initializer:
|
||||||
|
RE2* get() const {
|
||||||
|
std::call_once(once_, &LazyRE2::Init, this);
|
||||||
|
return ptr_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// All data fields must be public to support {"foo"} initialization.
|
||||||
|
const char* pattern_;
|
||||||
|
RE2::CannedOptions options_;
|
||||||
|
NoArg barrier_against_excess_initializers_;
|
||||||
|
|
||||||
|
mutable RE2* ptr_;
|
||||||
|
mutable std::once_flag once_;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static void Init(const LazyRE2* lazy_re2) {
|
||||||
|
lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
|
||||||
|
}
|
||||||
|
|
||||||
|
void operator=(const LazyRE2&); // disallowed
|
||||||
|
};
|
||||||
|
#endif // SWIG
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
using re2::RE2;
|
using re2::RE2;
|
||||||
|
using re2::LazyRE2;
|
||||||
|
|
||||||
#endif /* RE2_RE2_H */
|
#endif // RE2_RE2_H_
|
||||||
|
@ -5,8 +5,21 @@
|
|||||||
// Regular expression representation.
|
// Regular expression representation.
|
||||||
// Tested by parse_test.cc
|
// Tested by parse_test.cc
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <map>
|
||||||
|
#include <mutex>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/mutex.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/stringpiece.h"
|
#include "re2/stringpiece.h"
|
||||||
#include "re2/walker-inl.h"
|
#include "re2/walker-inl.h"
|
||||||
|
|
||||||
@ -14,9 +27,9 @@ namespace re2 {
|
|||||||
|
|
||||||
// Constructor. Allocates vectors as appropriate for operator.
|
// Constructor. Allocates vectors as appropriate for operator.
|
||||||
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
|
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
|
||||||
: op_(op),
|
: op_(static_cast<uint8_t>(op)),
|
||||||
simple_(false),
|
simple_(false),
|
||||||
parse_flags_(static_cast<uint16>(parse_flags)),
|
parse_flags_(static_cast<uint16_t>(parse_flags)),
|
||||||
ref_(1),
|
ref_(1),
|
||||||
nsub_(0),
|
nsub_(0),
|
||||||
down_(NULL) {
|
down_(NULL) {
|
||||||
@ -43,6 +56,7 @@ Regexp::~Regexp() {
|
|||||||
delete[] runes_;
|
delete[] runes_;
|
||||||
break;
|
break;
|
||||||
case kRegexpCharClass:
|
case kRegexpCharClass:
|
||||||
|
if (cc_)
|
||||||
cc_->Delete();
|
cc_->Delete();
|
||||||
delete ccb_;
|
delete ccb_;
|
||||||
break;
|
break;
|
||||||
@ -59,30 +73,29 @@ bool Regexp::QuickDestroy() {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static map<Regexp*, int> *ref_map;
|
// Lazily allocated.
|
||||||
GLOBAL_MUTEX(ref_mutex);
|
static Mutex* ref_mutex;
|
||||||
|
static std::map<Regexp*, int>* ref_map;
|
||||||
|
|
||||||
int Regexp::Ref() {
|
int Regexp::Ref() {
|
||||||
if (ref_ < kMaxRef)
|
if (ref_ < kMaxRef)
|
||||||
return ref_;
|
return ref_;
|
||||||
|
|
||||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
MutexLock l(ref_mutex);
|
||||||
int r = 0;
|
return (*ref_map)[this];
|
||||||
if (ref_map != NULL) {
|
|
||||||
r = (*ref_map)[this];
|
|
||||||
}
|
|
||||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
|
||||||
return r;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Increments reference count, returns object as convenience.
|
// Increments reference count, returns object as convenience.
|
||||||
Regexp* Regexp::Incref() {
|
Regexp* Regexp::Incref() {
|
||||||
if (ref_ >= kMaxRef-1) {
|
if (ref_ >= kMaxRef-1) {
|
||||||
|
static std::once_flag ref_once;
|
||||||
|
std::call_once(ref_once, []() {
|
||||||
|
ref_mutex = new Mutex;
|
||||||
|
ref_map = new std::map<Regexp*, int>;
|
||||||
|
});
|
||||||
|
|
||||||
// Store ref count in overflow map.
|
// Store ref count in overflow map.
|
||||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
MutexLock l(ref_mutex);
|
||||||
if (ref_map == NULL) {
|
|
||||||
ref_map = new map<Regexp*, int>;
|
|
||||||
}
|
|
||||||
if (ref_ == kMaxRef) {
|
if (ref_ == kMaxRef) {
|
||||||
// already overflowed
|
// already overflowed
|
||||||
(*ref_map)[this]++;
|
(*ref_map)[this]++;
|
||||||
@ -91,7 +104,6 @@ Regexp* Regexp::Incref() {
|
|||||||
(*ref_map)[this] = kMaxRef;
|
(*ref_map)[this] = kMaxRef;
|
||||||
ref_ = kMaxRef;
|
ref_ = kMaxRef;
|
||||||
}
|
}
|
||||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,15 +115,14 @@ Regexp* Regexp::Incref() {
|
|||||||
void Regexp::Decref() {
|
void Regexp::Decref() {
|
||||||
if (ref_ == kMaxRef) {
|
if (ref_ == kMaxRef) {
|
||||||
// Ref count is stored in overflow map.
|
// Ref count is stored in overflow map.
|
||||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
MutexLock l(ref_mutex);
|
||||||
int r = (*ref_map)[this] - 1;
|
int r = (*ref_map)[this] - 1;
|
||||||
if (r < kMaxRef) {
|
if (r < kMaxRef) {
|
||||||
ref_ = r;
|
ref_ = static_cast<uint16_t>(r);
|
||||||
ref_map->erase(this);
|
ref_map->erase(this);
|
||||||
} else {
|
} else {
|
||||||
(*ref_map)[this] = r;
|
(*ref_map)[this] = r;
|
||||||
}
|
}
|
||||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
ref_--;
|
ref_--;
|
||||||
@ -179,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
|
|||||||
return re;
|
return re;
|
||||||
}
|
}
|
||||||
|
|
||||||
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
|
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
|
||||||
if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
|
// Squash **, ++ and ??.
|
||||||
|
if (op == sub->op() && flags == sub->parse_flags())
|
||||||
return sub;
|
return sub;
|
||||||
Regexp* re = new Regexp(kRegexpPlus, flags);
|
|
||||||
|
// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
|
||||||
|
// op is Star/Plus/Quest, we just have to check that sub->op() is too.
|
||||||
|
if ((sub->op() == kRegexpStar ||
|
||||||
|
sub->op() == kRegexpPlus ||
|
||||||
|
sub->op() == kRegexpQuest) &&
|
||||||
|
flags == sub->parse_flags()) {
|
||||||
|
// If sub is Star, no need to rewrite it.
|
||||||
|
if (sub->op() == kRegexpStar)
|
||||||
|
return sub;
|
||||||
|
|
||||||
|
// Rewrite sub to Star.
|
||||||
|
Regexp* re = new Regexp(kRegexpStar, flags);
|
||||||
|
re->AllocSub(1);
|
||||||
|
re->sub()[0] = sub->sub()[0]->Incref();
|
||||||
|
sub->Decref(); // We didn't consume the reference after all.
|
||||||
|
return re;
|
||||||
|
}
|
||||||
|
|
||||||
|
Regexp* re = new Regexp(op, flags);
|
||||||
re->AllocSub(1);
|
re->AllocSub(1);
|
||||||
re->sub()[0] = sub;
|
re->sub()[0] = sub;
|
||||||
return re;
|
return re;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
|
||||||
|
return StarPlusOrQuest(kRegexpPlus, sub, flags);
|
||||||
|
}
|
||||||
|
|
||||||
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
|
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
|
||||||
if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
|
return StarPlusOrQuest(kRegexpStar, sub, flags);
|
||||||
return sub;
|
|
||||||
Regexp* re = new Regexp(kRegexpStar, flags);
|
|
||||||
re->AllocSub(1);
|
|
||||||
re->sub()[0] = sub;
|
|
||||||
return re;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
|
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
|
||||||
if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
|
return StarPlusOrQuest(kRegexpQuest, sub, flags);
|
||||||
return sub;
|
|
||||||
Regexp* re = new Regexp(kRegexpQuest, flags);
|
|
||||||
re->AllocSub(1);
|
|
||||||
re->sub()[0] = sub;
|
|
||||||
return re;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
|
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
|
||||||
@ -211,6 +236,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
|
|||||||
if (nsub == 1)
|
if (nsub == 1)
|
||||||
return sub[0];
|
return sub[0];
|
||||||
|
|
||||||
|
if (nsub == 0) {
|
||||||
|
if (op == kRegexpAlternate)
|
||||||
|
return new Regexp(kRegexpNoMatch, flags);
|
||||||
|
else
|
||||||
|
return new Regexp(kRegexpEmptyMatch, flags);
|
||||||
|
}
|
||||||
|
|
||||||
Regexp** subcopy = NULL;
|
Regexp** subcopy = NULL;
|
||||||
if (op == kRegexpAlternate && can_factor) {
|
if (op == kRegexpAlternate && can_factor) {
|
||||||
// Going to edit sub; make a copy so we don't step on caller.
|
// Going to edit sub; make a copy so we don't step on caller.
|
||||||
@ -405,7 +437,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
|
|||||||
// The stack (vector) has pairs of regexps waiting to
|
// The stack (vector) has pairs of regexps waiting to
|
||||||
// be compared. The regexps are only equal if
|
// be compared. The regexps are only equal if
|
||||||
// all the pairs end up being equal.
|
// all the pairs end up being equal.
|
||||||
vector<Regexp*> stk;
|
std::vector<Regexp*> stk;
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
// Invariant: TopEqual(a, b) == true.
|
// Invariant: TopEqual(a, b) == true.
|
||||||
@ -445,10 +477,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int n = stk.size();
|
size_t n = stk.size();
|
||||||
if (n == 0)
|
if (n == 0)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
DCHECK_GE(n, 2);
|
||||||
a = stk[n-2];
|
a = stk[n-2];
|
||||||
b = stk[n-1];
|
b = stk[n-1];
|
||||||
stk.resize(n-2);
|
stk.resize(n-2);
|
||||||
@ -517,7 +550,9 @@ class NumCapturesWalker : public Regexp::Walker<Ignored> {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
int ncapture_;
|
int ncapture_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);
|
|
||||||
|
NumCapturesWalker(const NumCapturesWalker&) = delete;
|
||||||
|
NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
int Regexp::NumCaptures() {
|
int Regexp::NumCaptures() {
|
||||||
@ -532,8 +567,8 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
|||||||
NamedCapturesWalker() : map_(NULL) {}
|
NamedCapturesWalker() : map_(NULL) {}
|
||||||
~NamedCapturesWalker() { delete map_; }
|
~NamedCapturesWalker() { delete map_; }
|
||||||
|
|
||||||
map<string, int>* TakeMap() {
|
std::map<string, int>* TakeMap() {
|
||||||
map<string, int>* m = map_;
|
std::map<string, int>* m = map_;
|
||||||
map_ = NULL;
|
map_ = NULL;
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
@ -542,7 +577,7 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
|||||||
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
||||||
// Allocate map once we find a name.
|
// Allocate map once we find a name.
|
||||||
if (map_ == NULL)
|
if (map_ == NULL)
|
||||||
map_ = new map<string, int>;
|
map_ = new std::map<string, int>;
|
||||||
|
|
||||||
// Record first occurrence of each name.
|
// Record first occurrence of each name.
|
||||||
// (The rule is that if you have the same name
|
// (The rule is that if you have the same name
|
||||||
@ -560,11 +595,13 @@ class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
map<string, int>* map_;
|
std::map<string, int>* map_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);
|
|
||||||
|
NamedCapturesWalker(const NamedCapturesWalker&) = delete;
|
||||||
|
NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
map<string, int>* Regexp::NamedCaptures() {
|
std::map<string, int>* Regexp::NamedCaptures() {
|
||||||
NamedCapturesWalker w;
|
NamedCapturesWalker w;
|
||||||
w.Walk(this, 0);
|
w.Walk(this, 0);
|
||||||
return w.TakeMap();
|
return w.TakeMap();
|
||||||
@ -576,8 +613,8 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
|||||||
CaptureNamesWalker() : map_(NULL) {}
|
CaptureNamesWalker() : map_(NULL) {}
|
||||||
~CaptureNamesWalker() { delete map_; }
|
~CaptureNamesWalker() { delete map_; }
|
||||||
|
|
||||||
map<int, string>* TakeMap() {
|
std::map<int, string>* TakeMap() {
|
||||||
map<int, string>* m = map_;
|
std::map<int, string>* m = map_;
|
||||||
map_ = NULL;
|
map_ = NULL;
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
@ -586,7 +623,7 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
|||||||
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
||||||
// Allocate map once we find a name.
|
// Allocate map once we find a name.
|
||||||
if (map_ == NULL)
|
if (map_ == NULL)
|
||||||
map_ = new map<int, string>;
|
map_ = new std::map<int, string>;
|
||||||
|
|
||||||
(*map_)[re->cap()] = *re->name();
|
(*map_)[re->cap()] = *re->name();
|
||||||
}
|
}
|
||||||
@ -600,11 +637,13 @@ class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
map<int, string>* map_;
|
std::map<int, string>* map_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);
|
|
||||||
|
CaptureNamesWalker(const CaptureNamesWalker&) = delete;
|
||||||
|
CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
map<int, string>* Regexp::CaptureNames() {
|
std::map<int, string>* Regexp::CaptureNames() {
|
||||||
CaptureNamesWalker w;
|
CaptureNamesWalker w;
|
||||||
w.Walk(this, 0);
|
w.Walk(this, 0);
|
||||||
return w.TakeMap();
|
return w.TakeMap();
|
||||||
@ -643,7 +682,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
|||||||
if (re->parse_flags() & Latin1) {
|
if (re->parse_flags() & Latin1) {
|
||||||
prefix->resize(re->nrunes_);
|
prefix->resize(re->nrunes_);
|
||||||
for (int j = 0; j < re->nrunes_; j++)
|
for (int j = 0; j < re->nrunes_; j++)
|
||||||
(*prefix)[j] = re->runes_[j];
|
(*prefix)[j] = static_cast<char>(re->runes_[j]);
|
||||||
} else {
|
} else {
|
||||||
// Convert to UTF-8 in place.
|
// Convert to UTF-8 in place.
|
||||||
// Assume worst-case space and then trim.
|
// Assume worst-case space and then trim.
|
||||||
@ -652,7 +691,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
|||||||
for (int j = 0; j < re->nrunes_; j++) {
|
for (int j = 0; j < re->nrunes_; j++) {
|
||||||
Rune r = re->runes_[j];
|
Rune r = re->runes_[j];
|
||||||
if (r < Runeself)
|
if (r < Runeself)
|
||||||
*p++ = r;
|
*p++ = static_cast<char>(r);
|
||||||
else
|
else
|
||||||
p += runetochar(p, &r);
|
p += runetochar(p, &r);
|
||||||
}
|
}
|
||||||
@ -662,14 +701,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
|||||||
|
|
||||||
case kRegexpLiteral:
|
case kRegexpLiteral:
|
||||||
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
|
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
|
||||||
prefix->append(1, re->rune_);
|
prefix->append(1, static_cast<char>(re->rune_));
|
||||||
} else {
|
} else {
|
||||||
char buf[UTFmax];
|
char buf[UTFmax];
|
||||||
prefix->append(buf, runetochar(buf, &re->rune_));
|
prefix->append(buf, runetochar(buf, &re->rune_));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
*foldcase = (sub[i]->parse_flags() & FoldCase);
|
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
|
||||||
i++;
|
i++;
|
||||||
|
|
||||||
// The rest.
|
// The rest.
|
||||||
@ -704,13 +743,13 @@ bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
|
|||||||
if (lo <= 'z' && hi >= 'A') {
|
if (lo <= 'z' && hi >= 'A') {
|
||||||
// Overlaps some alpha, maybe not all.
|
// Overlaps some alpha, maybe not all.
|
||||||
// Update bitmaps telling which ASCII letters are in the set.
|
// Update bitmaps telling which ASCII letters are in the set.
|
||||||
Rune lo1 = max<Rune>(lo, 'A');
|
Rune lo1 = std::max<Rune>(lo, 'A');
|
||||||
Rune hi1 = min<Rune>(hi, 'Z');
|
Rune hi1 = std::min<Rune>(hi, 'Z');
|
||||||
if (lo1 <= hi1)
|
if (lo1 <= hi1)
|
||||||
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
|
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
|
||||||
|
|
||||||
lo1 = max<Rune>(lo, 'a');
|
lo1 = std::max<Rune>(lo, 'a');
|
||||||
hi1 = min<Rune>(hi, 'z');
|
hi1 = std::min<Rune>(hi, 'z');
|
||||||
if (lo1 <= hi1)
|
if (lo1 <= hi1)
|
||||||
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
|
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
|
||||||
}
|
}
|
||||||
@ -826,7 +865,7 @@ void CharClassBuilder::RemoveAbove(Rune r) {
|
|||||||
void CharClassBuilder::Negate() {
|
void CharClassBuilder::Negate() {
|
||||||
// Build up negation and then copy in.
|
// Build up negation and then copy in.
|
||||||
// Could edit ranges in place, but C++ won't let me.
|
// Could edit ranges in place, but C++ won't let me.
|
||||||
vector<RuneRange> v;
|
std::vector<RuneRange> v;
|
||||||
v.reserve(ranges_.size() + 1);
|
v.reserve(ranges_.size() + 1);
|
||||||
|
|
||||||
// In negation, first range begins at 0, unless
|
// In negation, first range begins at 0, unless
|
||||||
@ -863,7 +902,7 @@ void CharClassBuilder::Negate() {
|
|||||||
|
|
||||||
CharClass* CharClass::New(int maxranges) {
|
CharClass* CharClass::New(int maxranges) {
|
||||||
CharClass* cc;
|
CharClass* cc;
|
||||||
uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
|
uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
|
||||||
cc = reinterpret_cast<CharClass*>(data);
|
cc = reinterpret_cast<CharClass*>(data);
|
||||||
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
|
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
|
||||||
cc->nranges_ = 0;
|
cc->nranges_ = 0;
|
||||||
@ -873,7 +912,7 @@ CharClass* CharClass::New(int maxranges) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void CharClass::Delete() {
|
void CharClass::Delete() {
|
||||||
uint8 *data = reinterpret_cast<uint8*>(this);
|
uint8_t* data = reinterpret_cast<uint8_t*>(this);
|
||||||
delete[] data;
|
delete[] data;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -915,7 +954,7 @@ bool CharClass::Contains(Rune r) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
CharClass* CharClassBuilder::GetCharClass() {
|
CharClass* CharClassBuilder::GetCharClass() {
|
||||||
CharClass* cc = CharClass::New(ranges_.size());
|
CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
|
||||||
int n = 0;
|
int n = 0;
|
||||||
for (iterator it = begin(); it != end(); ++it)
|
for (iterator it = begin(); it != end(); ++it)
|
||||||
cc->ranges_[n++] = *it;
|
cc->ranges_[n++] = *it;
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_REGEXP_H_
|
||||||
|
#define RE2_REGEXP_H_
|
||||||
|
|
||||||
// --- SPONSORED LINK --------------------------------------------------
|
// --- SPONSORED LINK --------------------------------------------------
|
||||||
// If you want to use this library for regular expression matching,
|
// If you want to use this library for regular expression matching,
|
||||||
// you should use re2/re2.h, which provides a class RE2 that
|
// you should use re2/re2.h, which provides a class RE2 that
|
||||||
@ -83,10 +86,14 @@
|
|||||||
// form accessible to clients, so that client code can analyze the
|
// form accessible to clients, so that client code can analyze the
|
||||||
// parsed regular expressions.
|
// parsed regular expressions.
|
||||||
|
|
||||||
#ifndef RE2_REGEXP_H__
|
#include <stdint.h>
|
||||||
#define RE2_REGEXP_H__
|
#include <map>
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/stringpiece.h"
|
#include "re2/stringpiece.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
@ -185,10 +192,10 @@ class RegexpStatus {
|
|||||||
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
||||||
~RegexpStatus() { delete tmp_; }
|
~RegexpStatus() { delete tmp_; }
|
||||||
|
|
||||||
void set_code(enum RegexpStatusCode code) { code_ = code; }
|
void set_code(RegexpStatusCode code) { code_ = code; }
|
||||||
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
||||||
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
||||||
enum RegexpStatusCode code() const { return code_; }
|
RegexpStatusCode code() const { return code_; }
|
||||||
const StringPiece& error_arg() const { return error_arg_; }
|
const StringPiece& error_arg() const { return error_arg_; }
|
||||||
bool ok() const { return code() == kRegexpSuccess; }
|
bool ok() const { return code() == kRegexpSuccess; }
|
||||||
|
|
||||||
@ -197,23 +204,21 @@ class RegexpStatus {
|
|||||||
|
|
||||||
// Returns text equivalent of code, e.g.:
|
// Returns text equivalent of code, e.g.:
|
||||||
// "Bad character class"
|
// "Bad character class"
|
||||||
static string CodeText(enum RegexpStatusCode code);
|
static string CodeText(RegexpStatusCode code);
|
||||||
|
|
||||||
// Returns text describing error, e.g.:
|
// Returns text describing error, e.g.:
|
||||||
// "Bad character class: [z-a]"
|
// "Bad character class: [z-a]"
|
||||||
string Text() const;
|
string Text() const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum RegexpStatusCode code_; // Kind of error
|
RegexpStatusCode code_; // Kind of error
|
||||||
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
||||||
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus);
|
RegexpStatus(const RegexpStatus&) = delete;
|
||||||
|
RegexpStatus& operator=(const RegexpStatus&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Walker to implement Simplify.
|
|
||||||
class SimplifyWalker;
|
|
||||||
|
|
||||||
// Compiled form; see prog.h
|
// Compiled form; see prog.h
|
||||||
class Prog;
|
class Prog;
|
||||||
|
|
||||||
@ -261,7 +266,9 @@ class CharClass {
|
|||||||
int nrunes_;
|
int nrunes_;
|
||||||
RuneRange *ranges_;
|
RuneRange *ranges_;
|
||||||
int nranges_;
|
int nranges_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(CharClass);
|
|
||||||
|
CharClass(const CharClass&) = delete;
|
||||||
|
CharClass& operator=(const CharClass&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
class Regexp {
|
class Regexp {
|
||||||
@ -306,14 +313,15 @@ class Regexp {
|
|||||||
UnicodeGroups,
|
UnicodeGroups,
|
||||||
|
|
||||||
// Internal use only.
|
// Internal use only.
|
||||||
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
|
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
|
||||||
|
AllParseFlags = (1<<14)-1,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Get. No set, Regexps are logically immutable once created.
|
// Get. No set, Regexps are logically immutable once created.
|
||||||
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
||||||
int nsub() { return nsub_; }
|
int nsub() { return nsub_; }
|
||||||
bool simple() { return simple_; }
|
bool simple() { return simple_ != 0; }
|
||||||
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
||||||
int Ref(); // For testing.
|
int Ref(); // For testing.
|
||||||
|
|
||||||
Regexp** sub() {
|
Regexp** sub() {
|
||||||
@ -353,6 +361,7 @@ class Regexp {
|
|||||||
// removed. The result will capture exactly the same
|
// removed. The result will capture exactly the same
|
||||||
// subexpressions the original did, unless formatted with ToString.
|
// subexpressions the original did, unless formatted with ToString.
|
||||||
Regexp* Simplify();
|
Regexp* Simplify();
|
||||||
|
friend class CoalesceWalker;
|
||||||
friend class SimplifyWalker;
|
friend class SimplifyWalker;
|
||||||
|
|
||||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||||
@ -369,12 +378,12 @@ class Regexp {
|
|||||||
// Returns a map from names to capturing group indices,
|
// Returns a map from names to capturing group indices,
|
||||||
// or NULL if the regexp contains no named capture groups.
|
// or NULL if the regexp contains no named capture groups.
|
||||||
// The caller is responsible for deleting the map.
|
// The caller is responsible for deleting the map.
|
||||||
map<string, int>* NamedCaptures();
|
std::map<string, int>* NamedCaptures();
|
||||||
|
|
||||||
// Returns a map from capturing group indices to capturing group
|
// Returns a map from capturing group indices to capturing group
|
||||||
// names or NULL if the regexp contains no named capture groups. The
|
// names or NULL if the regexp contains no named capture groups. The
|
||||||
// caller is responsible for deleting the map.
|
// caller is responsible for deleting the map.
|
||||||
map<int, string>* CaptureNames();
|
std::map<int, string>* CaptureNames();
|
||||||
|
|
||||||
// Returns a string representation of the current regexp,
|
// Returns a string representation of the current regexp,
|
||||||
// using as few parentheses as possible.
|
// using as few parentheses as possible.
|
||||||
@ -410,8 +419,8 @@ class Regexp {
|
|||||||
// Construction and execution of prog will
|
// Construction and execution of prog will
|
||||||
// stay within approximately max_mem bytes of memory.
|
// stay within approximately max_mem bytes of memory.
|
||||||
// If max_mem <= 0, a reasonable default is used.
|
// If max_mem <= 0, a reasonable default is used.
|
||||||
Prog* CompileToProg(int64 max_mem);
|
Prog* CompileToProg(int64_t max_mem);
|
||||||
Prog* CompileToReverseProg(int64 max_mem);
|
Prog* CompileToReverseProg(int64_t max_mem);
|
||||||
|
|
||||||
// Whether to expect this library to find exactly the same answer as PCRE
|
// Whether to expect this library to find exactly the same answer as PCRE
|
||||||
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
||||||
@ -427,6 +436,8 @@ class Regexp {
|
|||||||
// begin with a non-empty fixed string (perhaps after ASCII
|
// begin with a non-empty fixed string (perhaps after ASCII
|
||||||
// case-folding). If so, returns the prefix and the sub-regexp that
|
// case-folding). If so, returns the prefix and the sub-regexp that
|
||||||
// follows it.
|
// follows it.
|
||||||
|
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
|
||||||
|
// regardless of the return value.
|
||||||
bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
|
bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -441,6 +452,7 @@ class Regexp {
|
|||||||
|
|
||||||
// Helpers for Parse. Listed here so they can edit Regexps.
|
// Helpers for Parse. Listed here so they can edit Regexps.
|
||||||
class ParseState;
|
class ParseState;
|
||||||
|
|
||||||
friend class ParseState;
|
friend class ParseState;
|
||||||
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
||||||
RegexpStatus* status);
|
RegexpStatus* status);
|
||||||
@ -451,6 +463,10 @@ class Regexp {
|
|||||||
// Computes whether Regexp is already simple.
|
// Computes whether Regexp is already simple.
|
||||||
bool ComputeSimple();
|
bool ComputeSimple();
|
||||||
|
|
||||||
|
// Constructor that generates a Star, Plus or Quest,
|
||||||
|
// squashing the pair if sub is also a Star, Plus or Quest.
|
||||||
|
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
|
||||||
|
|
||||||
// Constructor that generates a concatenation or alternation,
|
// Constructor that generates a concatenation or alternation,
|
||||||
// enforcing the limit on the number of subexpressions for
|
// enforcing the limit on the number of subexpressions for
|
||||||
// a particular Regexp.
|
// a particular Regexp.
|
||||||
@ -478,8 +494,7 @@ class Regexp {
|
|||||||
// Simplifies an alternation of literal strings by factoring out
|
// Simplifies an alternation of literal strings by factoring out
|
||||||
// common prefixes.
|
// common prefixes.
|
||||||
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
||||||
static int FactorAlternationRecursive(Regexp** sub, int nsub,
|
friend class FactorAlternationImpl;
|
||||||
ParseFlags flags, int maxdepth);
|
|
||||||
|
|
||||||
// Is a == b? Only efficient on regexps that have not been through
|
// Is a == b? Only efficient on regexps that have not been through
|
||||||
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
||||||
@ -488,11 +503,10 @@ class Regexp {
|
|||||||
|
|
||||||
// Allocate space for n sub-regexps.
|
// Allocate space for n sub-regexps.
|
||||||
void AllocSub(int n) {
|
void AllocSub(int n) {
|
||||||
if (n < 0 || static_cast<uint16>(n) != n)
|
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
|
||||||
LOG(FATAL) << "Cannot AllocSub " << n;
|
|
||||||
if (n > 1)
|
if (n > 1)
|
||||||
submany_ = new Regexp*[n];
|
submany_ = new Regexp*[n];
|
||||||
nsub_ = n;
|
nsub_ = static_cast<uint16_t>(n);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add Rune to LiteralString
|
// Add Rune to LiteralString
|
||||||
@ -502,38 +516,38 @@ class Regexp {
|
|||||||
void Swap(Regexp *that);
|
void Swap(Regexp *that);
|
||||||
|
|
||||||
// Operator. See description of operators above.
|
// Operator. See description of operators above.
|
||||||
// uint8 instead of RegexpOp to control space usage.
|
// uint8_t instead of RegexpOp to control space usage.
|
||||||
uint8 op_;
|
uint8_t op_;
|
||||||
|
|
||||||
// Is this regexp structure already simple
|
// Is this regexp structure already simple
|
||||||
// (has it been returned by Simplify)?
|
// (has it been returned by Simplify)?
|
||||||
// uint8 instead of bool to control space usage.
|
// uint8_t instead of bool to control space usage.
|
||||||
uint8 simple_;
|
uint8_t simple_;
|
||||||
|
|
||||||
// Flags saved from parsing and used during execution.
|
// Flags saved from parsing and used during execution.
|
||||||
// (Only FoldCase is used.)
|
// (Only FoldCase is used.)
|
||||||
// uint16 instead of ParseFlags to control space usage.
|
// uint16_t instead of ParseFlags to control space usage.
|
||||||
uint16 parse_flags_;
|
uint16_t parse_flags_;
|
||||||
|
|
||||||
// Reference count. Exists so that SimplifyRegexp can build
|
// Reference count. Exists so that SimplifyRegexp can build
|
||||||
// regexp structures that are dags rather than trees to avoid
|
// regexp structures that are dags rather than trees to avoid
|
||||||
// exponential blowup in space requirements.
|
// exponential blowup in space requirements.
|
||||||
// uint16 to control space usage.
|
// uint16_t to control space usage.
|
||||||
// The standard regexp routines will never generate a
|
// The standard regexp routines will never generate a
|
||||||
// ref greater than the maximum repeat count (100),
|
// ref greater than the maximum repeat count (kMaxRepeat),
|
||||||
// but even so, Incref and Decref consult an overflow map
|
// but even so, Incref and Decref consult an overflow map
|
||||||
// when ref_ reaches kMaxRef.
|
// when ref_ reaches kMaxRef.
|
||||||
uint16 ref_;
|
uint16_t ref_;
|
||||||
static const uint16 kMaxRef = 0xffff;
|
static const uint16_t kMaxRef = 0xffff;
|
||||||
|
|
||||||
// Subexpressions.
|
// Subexpressions.
|
||||||
// uint16 to control space usage.
|
// uint16_t to control space usage.
|
||||||
// Concat and Alternate handle larger numbers of subexpressions
|
// Concat and Alternate handle larger numbers of subexpressions
|
||||||
// by building concatenation or alternation trees.
|
// by building concatenation or alternation trees.
|
||||||
// Other routines should call Concat or Alternate instead of
|
// Other routines should call Concat or Alternate instead of
|
||||||
// filling in sub() by hand.
|
// filling in sub() by hand.
|
||||||
uint16 nsub_;
|
uint16_t nsub_;
|
||||||
static const uint16 kMaxNsub = 0xffff;
|
static const uint16_t kMaxNsub = 0xffff;
|
||||||
union {
|
union {
|
||||||
Regexp** submany_; // if nsub_ > 1
|
Regexp** submany_; // if nsub_ > 1
|
||||||
Regexp* subone_; // if nsub_ == 1
|
Regexp* subone_; // if nsub_ == 1
|
||||||
@ -568,11 +582,12 @@ class Regexp {
|
|||||||
void *the_union_[2]; // as big as any other element, for memset
|
void *the_union_[2]; // as big as any other element, for memset
|
||||||
};
|
};
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Regexp);
|
Regexp(const Regexp&) = delete;
|
||||||
|
Regexp& operator=(const Regexp&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
||||||
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
|
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
|
||||||
|
|
||||||
class CharClassBuilder {
|
class CharClassBuilder {
|
||||||
public:
|
public:
|
||||||
@ -597,37 +612,41 @@ class CharClassBuilder {
|
|||||||
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static const uint32 AlphaMask = (1<<26) - 1;
|
static const uint32_t AlphaMask = (1<<26) - 1;
|
||||||
uint32 upper_; // bitmap of A-Z
|
uint32_t upper_; // bitmap of A-Z
|
||||||
uint32 lower_; // bitmap of a-z
|
uint32_t lower_; // bitmap of a-z
|
||||||
int nrunes_;
|
int nrunes_;
|
||||||
RuneRangeSet ranges_;
|
RuneRangeSet ranges_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder);
|
|
||||||
|
CharClassBuilder(const CharClassBuilder&) = delete;
|
||||||
|
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
|
// Bitwise ops on ParseFlags produce ParseFlags.
|
||||||
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
|
||||||
{
|
Regexp::ParseFlags b) {
|
||||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
|
return static_cast<Regexp::ParseFlags>(
|
||||||
|
static_cast<int>(a) | static_cast<int>(b));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
|
||||||
{
|
Regexp::ParseFlags b) {
|
||||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) ^ static_cast<int>(b));
|
return static_cast<Regexp::ParseFlags>(
|
||||||
|
static_cast<int>(a) ^ static_cast<int>(b));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
|
||||||
{
|
Regexp::ParseFlags b) {
|
||||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) & static_cast<int>(b));
|
return static_cast<Regexp::ParseFlags>(
|
||||||
|
static_cast<int>(a) & static_cast<int>(b));
|
||||||
}
|
}
|
||||||
|
|
||||||
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
|
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
|
||||||
{
|
// Attempting to produce a value out of enum's range has undefined behaviour.
|
||||||
return static_cast<Regexp::ParseFlags>(~static_cast<int>(a));
|
return static_cast<Regexp::ParseFlags>(
|
||||||
|
~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_REGEXP_H__
|
#endif // RE2_REGEXP_H_
|
||||||
|
@ -4,36 +4,42 @@
|
|||||||
|
|
||||||
#include "re2/set.h"
|
#include "re2/set.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
#include "re2/stringpiece.h"
|
#include "re2/stringpiece.h"
|
||||||
#include "re2/prog.h"
|
#include "re2/prog.h"
|
||||||
#include "re2/re2.h"
|
#include "re2/re2.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
|
|
||||||
using namespace re2;
|
namespace re2 {
|
||||||
|
|
||||||
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
|
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
|
||||||
options_.Copy(options);
|
options_.Copy(options);
|
||||||
|
options_.set_never_capture(true); // might unblock some optimisations
|
||||||
anchor_ = anchor;
|
anchor_ = anchor;
|
||||||
prog_ = NULL;
|
prog_ = NULL;
|
||||||
compiled_ = false;
|
compiled_ = false;
|
||||||
|
size_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
RE2::Set::~Set() {
|
RE2::Set::~Set() {
|
||||||
for (size_t i = 0; i < re_.size(); i++)
|
for (size_t i = 0; i < elem_.size(); i++)
|
||||||
re_[i]->Decref();
|
elem_[i].second->Decref();
|
||||||
delete prog_;
|
delete prog_;
|
||||||
}
|
}
|
||||||
|
|
||||||
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
||||||
if (compiled_) {
|
if (compiled_) {
|
||||||
LOG(DFATAL) << "RE2::Set::Add after Compile";
|
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||||
options_.ParseFlags());
|
options_.ParseFlags());
|
||||||
|
|
||||||
RegexpStatus status;
|
RegexpStatus status;
|
||||||
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
||||||
if (re == NULL) {
|
if (re == NULL) {
|
||||||
@ -45,7 +51,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Concatenate with match index and push on vector.
|
// Concatenate with match index and push on vector.
|
||||||
int n = re_.size();
|
int n = static_cast<int>(elem_.size());
|
||||||
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
||||||
if (re->op() == kRegexpConcat) {
|
if (re->op() == kRegexpConcat) {
|
||||||
int nsub = re->nsub();
|
int nsub = re->nsub();
|
||||||
@ -62,52 +68,87 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
|||||||
sub[1] = m;
|
sub[1] = m;
|
||||||
re = re2::Regexp::Concat(sub, 2, pf);
|
re = re2::Regexp::Concat(sub, 2, pf);
|
||||||
}
|
}
|
||||||
re_.push_back(re);
|
elem_.emplace_back(pattern.ToString(), re);
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool RE2::Set::Compile() {
|
bool RE2::Set::Compile() {
|
||||||
if (compiled_) {
|
if (compiled_) {
|
||||||
LOG(DFATAL) << "RE2::Set::Compile multiple times";
|
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
compiled_ = true;
|
compiled_ = true;
|
||||||
|
size_ = static_cast<int>(elem_.size());
|
||||||
|
|
||||||
|
// Sort the elements by their patterns. This is good enough for now
|
||||||
|
// until we have a Regexp comparison function. (Maybe someday...)
|
||||||
|
std::sort(elem_.begin(), elem_.end(),
|
||||||
|
[](const Elem& a, const Elem& b) -> bool {
|
||||||
|
return a.first < b.first;
|
||||||
|
});
|
||||||
|
|
||||||
|
re2::Regexp** sub = new re2::Regexp*[size_];
|
||||||
|
for (size_t i = 0; i < elem_.size(); i++)
|
||||||
|
sub[i] = elem_[i].second;
|
||||||
|
elem_.clear();
|
||||||
|
elem_.shrink_to_fit();
|
||||||
|
|
||||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||||
options_.ParseFlags());
|
options_.ParseFlags());
|
||||||
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(&re_[0]),
|
re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
|
||||||
re_.size(), pf);
|
delete[] sub;
|
||||||
re_.clear();
|
|
||||||
re2::Regexp* sre = re->Simplify();
|
|
||||||
re->Decref();
|
|
||||||
re = sre;
|
|
||||||
if (re == NULL) {
|
|
||||||
if (options_.log_errors())
|
|
||||||
LOG(ERROR) << "Error simplifying during Compile.";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
prog_ = Prog::CompileSet(options_, anchor_, re);
|
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
|
||||||
|
re->Decref();
|
||||||
return prog_ != NULL;
|
return prog_ != NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
|
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
|
||||||
if (!compiled_) {
|
return Match(text, v, NULL);
|
||||||
LOG(DFATAL) << "RE2::Set::Match without Compile";
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
v->clear();
|
|
||||||
bool failed;
|
|
||||||
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
|
|
||||||
Prog::kManyMatch, NULL, &failed, v);
|
|
||||||
if (failed)
|
|
||||||
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
|
|
||||||
|
|
||||||
if (ret == false)
|
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
|
||||||
return false;
|
ErrorInfo* error_info) const {
|
||||||
if (v->size() == 0) {
|
if (!compiled_) {
|
||||||
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
|
LOG(DFATAL) << "RE2::Set::Match() called before compiling";
|
||||||
|
if (error_info != NULL)
|
||||||
|
error_info->kind = kNotCompiled;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
bool dfa_failed = false;
|
||||||
|
std::unique_ptr<SparseSet> matches;
|
||||||
|
if (v != NULL) {
|
||||||
|
matches.reset(new SparseSet(size_));
|
||||||
|
v->clear();
|
||||||
|
}
|
||||||
|
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
|
||||||
|
NULL, &dfa_failed, matches.get());
|
||||||
|
if (dfa_failed) {
|
||||||
|
if (options_.log_errors())
|
||||||
|
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
|
||||||
|
<< "bytemap range " << prog_->bytemap_range() << ", "
|
||||||
|
<< "list count " << prog_->list_count();
|
||||||
|
if (error_info != NULL)
|
||||||
|
error_info->kind = kOutOfMemory;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (ret == false) {
|
||||||
|
if (error_info != NULL)
|
||||||
|
error_info->kind = kNoError;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (v != NULL) {
|
||||||
|
if (matches->empty()) {
|
||||||
|
LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
|
||||||
|
if (error_info != NULL)
|
||||||
|
error_info->kind = kInconsistent;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
v->assign(matches->begin(), matches->end());
|
||||||
|
}
|
||||||
|
if (error_info != NULL)
|
||||||
|
error_info->kind = kNoError;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // namespace re2
|
||||||
|
@ -2,54 +2,79 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#ifndef RE2_SET_H
|
#ifndef RE2_SET_H_
|
||||||
#define RE2_SET_H
|
#define RE2_SET_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "re2/re2.h"
|
#include "re2/re2.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
using std::vector;
|
class Prog;
|
||||||
|
class Regexp;
|
||||||
|
} // namespace re2
|
||||||
|
|
||||||
|
namespace re2 {
|
||||||
|
|
||||||
// An RE2::Set represents a collection of regexps that can
|
// An RE2::Set represents a collection of regexps that can
|
||||||
// be searched for simultaneously.
|
// be searched for simultaneously.
|
||||||
class RE2::Set {
|
class RE2::Set {
|
||||||
public:
|
public:
|
||||||
|
enum ErrorKind {
|
||||||
|
kNoError = 0,
|
||||||
|
kNotCompiled, // The set is not compiled.
|
||||||
|
kOutOfMemory, // The DFA ran out of memory.
|
||||||
|
kInconsistent, // The result is inconsistent. This should never happen.
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ErrorInfo {
|
||||||
|
ErrorKind kind;
|
||||||
|
};
|
||||||
|
|
||||||
Set(const RE2::Options& options, RE2::Anchor anchor);
|
Set(const RE2::Options& options, RE2::Anchor anchor);
|
||||||
~Set();
|
~Set();
|
||||||
|
|
||||||
// Add adds regexp pattern to the set, interpreted using the RE2 options.
|
// Adds pattern to the set using the options passed to the constructor.
|
||||||
// (The RE2 constructor's default options parameter is RE2::UTF8.)
|
// Returns the index that will identify the regexp in the output of Match(),
|
||||||
// Add returns the regexp index that will be used to identify
|
// or -1 if the regexp cannot be parsed.
|
||||||
// it in the result of Match, or -1 if the regexp cannot be parsed.
|
|
||||||
// Indices are assigned in sequential order starting from 0.
|
// Indices are assigned in sequential order starting from 0.
|
||||||
// Error returns do not increment the index.
|
// Errors do not increment the index; if error is not NULL, *error will hold
|
||||||
// If an error occurs and error != NULL, *error will hold an error message.
|
// the error message from the parser.
|
||||||
int Add(const StringPiece& pattern, string* error);
|
int Add(const StringPiece& pattern, string* error);
|
||||||
|
|
||||||
// Compile prepares the Set for matching.
|
// Compiles the set in preparation for matching.
|
||||||
// Add must not be called again after Compile.
|
// Returns false if the compiler runs out of memory.
|
||||||
// Compile must be called before FullMatch or PartialMatch.
|
// Add() must not be called again after Compile().
|
||||||
// Compile may return false if it runs out of memory.
|
// Compile() must be called before Match().
|
||||||
bool Compile();
|
bool Compile();
|
||||||
|
|
||||||
// Match returns true if text matches any of the regexps in the set.
|
// Returns true if text matches at least one of the regexps in the set.
|
||||||
// If so, it fills v with the indices of the matching regexps.
|
// Fills v (if not NULL) with the indices of the matching regexps.
|
||||||
bool Match(const StringPiece& text, vector<int>* v) const;
|
// Callers must not expect v to be sorted.
|
||||||
|
bool Match(const StringPiece& text, std::vector<int>* v) const;
|
||||||
|
|
||||||
|
// As above, but populates error_info (if not NULL) when none of the regexps
|
||||||
|
// in the set matched. This can inform callers when DFA execution fails, for
|
||||||
|
// example, because they might wish to handle that case differently.
|
||||||
|
bool Match(const StringPiece& text, std::vector<int>* v,
|
||||||
|
ErrorInfo* error_info) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
typedef std::pair<string, re2::Regexp*> Elem;
|
||||||
|
|
||||||
RE2::Options options_;
|
RE2::Options options_;
|
||||||
RE2::Anchor anchor_;
|
RE2::Anchor anchor_;
|
||||||
vector<re2::Regexp*> re_;
|
std::vector<Elem> elem_;
|
||||||
re2::Prog* prog_;
|
re2::Prog* prog_;
|
||||||
bool compiled_;
|
bool compiled_;
|
||||||
//DISALLOW_EVIL_CONSTRUCTORS(Set);
|
int size_;
|
||||||
Set(const Set&);
|
|
||||||
void operator=(const Set&);
|
Set(const Set&) = delete;
|
||||||
|
Set& operator=(const Set&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_SET_H
|
#endif // RE2_SET_H_
|
||||||
|
@ -6,7 +6,11 @@
|
|||||||
// to use simple extended regular expression features.
|
// to use simple extended regular expression features.
|
||||||
// Also sort and simplify character classes.
|
// Also sort and simplify character classes.
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
#include "re2/walker-inl.h"
|
#include "re2/walker-inl.h"
|
||||||
|
|
||||||
@ -61,7 +65,7 @@ bool Regexp::ComputeSimple() {
|
|||||||
// These are simple as long as the subpieces are simple.
|
// These are simple as long as the subpieces are simple.
|
||||||
subs = sub();
|
subs = sub();
|
||||||
for (int i = 0; i < nsub_; i++)
|
for (int i = 0; i < nsub_; i++)
|
||||||
if (!subs[i]->simple_)
|
if (!subs[i]->simple())
|
||||||
return false;
|
return false;
|
||||||
return true;
|
return true;
|
||||||
case kRegexpCharClass:
|
case kRegexpCharClass:
|
||||||
@ -71,12 +75,12 @@ bool Regexp::ComputeSimple() {
|
|||||||
return !cc_->empty() && !cc_->full();
|
return !cc_->empty() && !cc_->full();
|
||||||
case kRegexpCapture:
|
case kRegexpCapture:
|
||||||
subs = sub();
|
subs = sub();
|
||||||
return subs[0]->simple_;
|
return subs[0]->simple();
|
||||||
case kRegexpStar:
|
case kRegexpStar:
|
||||||
case kRegexpPlus:
|
case kRegexpPlus:
|
||||||
case kRegexpQuest:
|
case kRegexpQuest:
|
||||||
subs = sub();
|
subs = sub();
|
||||||
if (!subs[0]->simple_)
|
if (!subs[0]->simple())
|
||||||
return false;
|
return false;
|
||||||
switch (subs[0]->op_) {
|
switch (subs[0]->op_) {
|
||||||
case kRegexpStar:
|
case kRegexpStar:
|
||||||
@ -96,6 +100,37 @@ bool Regexp::ComputeSimple() {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Walker subclass used by Simplify.
|
||||||
|
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
||||||
|
// occurrences of that literal into repeats of that literal. It also works for
|
||||||
|
// char classes, any char and any byte.
|
||||||
|
// PostVisit creates the coalesced result, which should then be simplified.
|
||||||
|
class CoalesceWalker : public Regexp::Walker<Regexp*> {
|
||||||
|
public:
|
||||||
|
CoalesceWalker() {}
|
||||||
|
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||||
|
Regexp** child_args, int nchild_args);
|
||||||
|
virtual Regexp* Copy(Regexp* re);
|
||||||
|
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// These functions are declared inside CoalesceWalker so that
|
||||||
|
// they can edit the private fields of the Regexps they construct.
|
||||||
|
|
||||||
|
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
||||||
|
// the parse flags are consistent. (They will not be checked again later.)
|
||||||
|
static bool CanCoalesce(Regexp* r1, Regexp* r2);
|
||||||
|
|
||||||
|
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
||||||
|
// will be empty match and the coalesced op. In other cases, where part of a
|
||||||
|
// literal string was removed to be coalesced, the array elements afterwards
|
||||||
|
// will be the coalesced op and the remainder of the literal string.
|
||||||
|
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
|
||||||
|
|
||||||
|
CoalesceWalker(const CoalesceWalker&) = delete;
|
||||||
|
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
|
||||||
|
};
|
||||||
|
|
||||||
// Walker subclass used by Simplify.
|
// Walker subclass used by Simplify.
|
||||||
// The simplify walk is purely post-recursive: given the simplified children,
|
// The simplify walk is purely post-recursive: given the simplified children,
|
||||||
// PostVisit creates the simplified result.
|
// PostVisit creates the simplified result.
|
||||||
@ -104,9 +139,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
|||||||
public:
|
public:
|
||||||
SimplifyWalker() {}
|
SimplifyWalker() {}
|
||||||
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
||||||
virtual Regexp* PostVisit(Regexp* re,
|
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||||
Regexp* parent_arg,
|
|
||||||
Regexp* pre_arg,
|
|
||||||
Regexp** child_args, int nchild_args);
|
Regexp** child_args, int nchild_args);
|
||||||
virtual Regexp* Copy(Regexp* re);
|
virtual Regexp* Copy(Regexp* re);
|
||||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||||
@ -130,7 +163,8 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
|||||||
// Caller must Decref return value when done with it.
|
// Caller must Decref return value when done with it.
|
||||||
static Regexp* SimplifyCharClass(Regexp* re);
|
static Regexp* SimplifyCharClass(Regexp* re);
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
|
SimplifyWalker(const SimplifyWalker&) = delete;
|
||||||
|
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Simplifies a regular expression, returning a new regexp.
|
// Simplifies a regular expression, returning a new regexp.
|
||||||
@ -143,14 +177,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
|||||||
// Caller must Decref() return value when done with it.
|
// Caller must Decref() return value when done with it.
|
||||||
|
|
||||||
Regexp* Regexp::Simplify() {
|
Regexp* Regexp::Simplify() {
|
||||||
if (simple_)
|
CoalesceWalker cw;
|
||||||
return Incref();
|
Regexp* cre = cw.Walk(this, NULL);
|
||||||
SimplifyWalker w;
|
if (cre == NULL)
|
||||||
return w.Walk(this, NULL);
|
return cre;
|
||||||
|
SimplifyWalker sw;
|
||||||
|
Regexp* sre = sw.Walk(cre, NULL);
|
||||||
|
cre->Decref();
|
||||||
|
return sre;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define Simplify DontCallSimplify // Avoid accidental recursion
|
#define Simplify DontCallSimplify // Avoid accidental recursion
|
||||||
|
|
||||||
|
// Utility function for PostVisit implementations that compares re->sub() with
|
||||||
|
// child_args to determine whether any child_args changed. In the common case,
|
||||||
|
// where nothing changed, calls Decref() for all child_args and returns false,
|
||||||
|
// so PostVisit must return re->Incref(). Otherwise, returns true.
|
||||||
|
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
|
||||||
|
for (int i = 0; i < re->nsub(); i++) {
|
||||||
|
Regexp* sub = re->sub()[i];
|
||||||
|
Regexp* newsub = child_args[i];
|
||||||
|
if (newsub != sub)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < re->nsub(); i++) {
|
||||||
|
Regexp* newsub = child_args[i];
|
||||||
|
newsub->Decref();
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
Regexp* CoalesceWalker::Copy(Regexp* re) {
|
||||||
|
return re->Incref();
|
||||||
|
}
|
||||||
|
|
||||||
|
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
|
||||||
|
// This should never be called, since we use Walk and not
|
||||||
|
// WalkExponential.
|
||||||
|
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
|
||||||
|
return re->Incref();
|
||||||
|
}
|
||||||
|
|
||||||
|
Regexp* CoalesceWalker::PostVisit(Regexp* re,
|
||||||
|
Regexp* parent_arg,
|
||||||
|
Regexp* pre_arg,
|
||||||
|
Regexp** child_args,
|
||||||
|
int nchild_args) {
|
||||||
|
if (re->nsub() == 0)
|
||||||
|
return re->Incref();
|
||||||
|
|
||||||
|
if (re->op() != kRegexpConcat) {
|
||||||
|
if (!ChildArgsChanged(re, child_args))
|
||||||
|
return re->Incref();
|
||||||
|
|
||||||
|
// Something changed. Build a new op.
|
||||||
|
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||||
|
nre->AllocSub(re->nsub());
|
||||||
|
Regexp** nre_subs = nre->sub();
|
||||||
|
for (int i = 0; i < re->nsub(); i++)
|
||||||
|
nre_subs[i] = child_args[i];
|
||||||
|
// Repeats and Captures have additional data that must be copied.
|
||||||
|
if (re->op() == kRegexpRepeat) {
|
||||||
|
nre->min_ = re->min();
|
||||||
|
nre->max_ = re->max();
|
||||||
|
} else if (re->op() == kRegexpCapture) {
|
||||||
|
nre->cap_ = re->cap();
|
||||||
|
}
|
||||||
|
return nre;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool can_coalesce = false;
|
||||||
|
for (int i = 0; i < re->nsub(); i++) {
|
||||||
|
if (i+1 < re->nsub() &&
|
||||||
|
CanCoalesce(child_args[i], child_args[i+1])) {
|
||||||
|
can_coalesce = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!can_coalesce) {
|
||||||
|
if (!ChildArgsChanged(re, child_args))
|
||||||
|
return re->Incref();
|
||||||
|
|
||||||
|
// Something changed. Build a new op.
|
||||||
|
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||||
|
nre->AllocSub(re->nsub());
|
||||||
|
Regexp** nre_subs = nre->sub();
|
||||||
|
for (int i = 0; i < re->nsub(); i++)
|
||||||
|
nre_subs[i] = child_args[i];
|
||||||
|
return nre;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < re->nsub(); i++) {
|
||||||
|
if (i+1 < re->nsub() &&
|
||||||
|
CanCoalesce(child_args[i], child_args[i+1]))
|
||||||
|
DoCoalesce(&child_args[i], &child_args[i+1]);
|
||||||
|
}
|
||||||
|
// Determine how many empty matches were left by DoCoalesce.
|
||||||
|
int n = 0;
|
||||||
|
for (int i = n; i < re->nsub(); i++) {
|
||||||
|
if (child_args[i]->op() == kRegexpEmptyMatch)
|
||||||
|
n++;
|
||||||
|
}
|
||||||
|
// Build a new op.
|
||||||
|
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||||
|
nre->AllocSub(re->nsub() - n);
|
||||||
|
Regexp** nre_subs = nre->sub();
|
||||||
|
for (int i = 0, j = 0; i < re->nsub(); i++) {
|
||||||
|
if (child_args[i]->op() == kRegexpEmptyMatch) {
|
||||||
|
child_args[i]->Decref();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
nre_subs[j] = child_args[i];
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
return nre;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
|
||||||
|
// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
|
||||||
|
// any byte.
|
||||||
|
if ((r1->op() == kRegexpStar ||
|
||||||
|
r1->op() == kRegexpPlus ||
|
||||||
|
r1->op() == kRegexpQuest ||
|
||||||
|
r1->op() == kRegexpRepeat) &&
|
||||||
|
(r1->sub()[0]->op() == kRegexpLiteral ||
|
||||||
|
r1->sub()[0]->op() == kRegexpCharClass ||
|
||||||
|
r1->sub()[0]->op() == kRegexpAnyChar ||
|
||||||
|
r1->sub()[0]->op() == kRegexpAnyByte)) {
|
||||||
|
// r2 must be a star/plus/quest/repeat of the same literal, char class,
|
||||||
|
// any char or any byte.
|
||||||
|
if ((r2->op() == kRegexpStar ||
|
||||||
|
r2->op() == kRegexpPlus ||
|
||||||
|
r2->op() == kRegexpQuest ||
|
||||||
|
r2->op() == kRegexpRepeat) &&
|
||||||
|
Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
|
||||||
|
// The parse flags must be consistent.
|
||||||
|
((r1->parse_flags() & Regexp::NonGreedy) ==
|
||||||
|
(r2->parse_flags() & Regexp::NonGreedy))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// ... OR an occurrence of that literal, char class, any char or any byte
|
||||||
|
if (Regexp::Equal(r1->sub()[0], r2)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// ... OR a literal string that begins with that literal.
|
||||||
|
if (r1->sub()[0]->op() == kRegexpLiteral &&
|
||||||
|
r2->op() == kRegexpLiteralString &&
|
||||||
|
r2->runes()[0] == r1->sub()[0]->rune() &&
|
||||||
|
// The parse flags must be consistent.
|
||||||
|
((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
|
||||||
|
(r2->parse_flags() & Regexp::FoldCase))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
|
||||||
|
Regexp* r1 = *r1ptr;
|
||||||
|
Regexp* r2 = *r2ptr;
|
||||||
|
|
||||||
|
Regexp* nre = Regexp::Repeat(
|
||||||
|
r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
|
||||||
|
|
||||||
|
switch (r1->op()) {
|
||||||
|
case kRegexpStar:
|
||||||
|
nre->min_ = 0;
|
||||||
|
nre->max_ = -1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kRegexpPlus:
|
||||||
|
nre->min_ = 1;
|
||||||
|
nre->max_ = -1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kRegexpQuest:
|
||||||
|
nre->min_ = 0;
|
||||||
|
nre->max_ = 1;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kRegexpRepeat:
|
||||||
|
nre->min_ = r1->min();
|
||||||
|
nre->max_ = r1->max();
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
|
||||||
|
nre->Decref();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (r2->op()) {
|
||||||
|
case kRegexpStar:
|
||||||
|
nre->max_ = -1;
|
||||||
|
goto LeaveEmpty;
|
||||||
|
|
||||||
|
case kRegexpPlus:
|
||||||
|
nre->min_++;
|
||||||
|
nre->max_ = -1;
|
||||||
|
goto LeaveEmpty;
|
||||||
|
|
||||||
|
case kRegexpQuest:
|
||||||
|
if (nre->max() != -1)
|
||||||
|
nre->max_++;
|
||||||
|
goto LeaveEmpty;
|
||||||
|
|
||||||
|
case kRegexpRepeat:
|
||||||
|
nre->min_ += r2->min();
|
||||||
|
if (r2->max() == -1)
|
||||||
|
nre->max_ = -1;
|
||||||
|
else if (nre->max() != -1)
|
||||||
|
nre->max_ += r2->max();
|
||||||
|
goto LeaveEmpty;
|
||||||
|
|
||||||
|
case kRegexpLiteral:
|
||||||
|
case kRegexpCharClass:
|
||||||
|
case kRegexpAnyChar:
|
||||||
|
case kRegexpAnyByte:
|
||||||
|
nre->min_++;
|
||||||
|
if (nre->max() != -1)
|
||||||
|
nre->max_++;
|
||||||
|
goto LeaveEmpty;
|
||||||
|
|
||||||
|
LeaveEmpty:
|
||||||
|
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
|
||||||
|
*r2ptr = nre;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case kRegexpLiteralString: {
|
||||||
|
Rune r = r1->sub()[0]->rune();
|
||||||
|
// Determine how much of the literal string is removed.
|
||||||
|
// We know that we have at least one rune. :)
|
||||||
|
int n = 1;
|
||||||
|
while (n < r2->nrunes() && r2->runes()[n] == r)
|
||||||
|
n++;
|
||||||
|
nre->min_ += n;
|
||||||
|
if (nre->max() != -1)
|
||||||
|
nre->max_ += n;
|
||||||
|
if (n == r2->nrunes())
|
||||||
|
goto LeaveEmpty;
|
||||||
|
*r1ptr = nre;
|
||||||
|
*r2ptr = Regexp::LiteralString(
|
||||||
|
&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
|
||||||
|
nre->Decref();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
r1->Decref();
|
||||||
|
r2->Decref();
|
||||||
|
}
|
||||||
|
|
||||||
Regexp* SimplifyWalker::Copy(Regexp* re) {
|
Regexp* SimplifyWalker::Copy(Regexp* re) {
|
||||||
return re->Incref();
|
return re->Incref();
|
||||||
}
|
}
|
||||||
@ -163,7 +444,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
|
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
|
||||||
if (re->simple_) {
|
if (re->simple()) {
|
||||||
*stop = true;
|
*stop = true;
|
||||||
return re->Incref();
|
return re->Incref();
|
||||||
}
|
}
|
||||||
@ -196,29 +477,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
|
|||||||
case kRegexpConcat:
|
case kRegexpConcat:
|
||||||
case kRegexpAlternate: {
|
case kRegexpAlternate: {
|
||||||
// These are simple as long as the subpieces are simple.
|
// These are simple as long as the subpieces are simple.
|
||||||
// Two passes to avoid allocation in the common case.
|
if (!ChildArgsChanged(re, child_args)) {
|
||||||
bool changed = false;
|
|
||||||
Regexp** subs = re->sub();
|
|
||||||
for (int i = 0; i < re->nsub_; i++) {
|
|
||||||
Regexp* sub = subs[i];
|
|
||||||
Regexp* newsub = child_args[i];
|
|
||||||
if (newsub != sub) {
|
|
||||||
changed = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!changed) {
|
|
||||||
for (int i = 0; i < re->nsub_; i++) {
|
|
||||||
Regexp* newsub = child_args[i];
|
|
||||||
newsub->Decref();
|
|
||||||
}
|
|
||||||
re->simple_ = true;
|
re->simple_ = true;
|
||||||
return re->Incref();
|
return re->Incref();
|
||||||
}
|
}
|
||||||
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
||||||
nre->AllocSub(re->nsub_);
|
nre->AllocSub(re->nsub());
|
||||||
Regexp** nre_subs = nre->sub();
|
Regexp** nre_subs = nre->sub();
|
||||||
for (int i = 0; i <re->nsub_; i++)
|
for (int i = 0; i < re->nsub(); i++)
|
||||||
nre_subs[i] = child_args[i];
|
nre_subs[i] = child_args[i];
|
||||||
nre->simple_ = true;
|
nre->simple_ = true;
|
||||||
return nre;
|
return nre;
|
||||||
@ -234,7 +500,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
|
|||||||
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
|
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
|
||||||
nre->AllocSub(1);
|
nre->AllocSub(1);
|
||||||
nre->sub()[0] = newsub;
|
nre->sub()[0] = newsub;
|
||||||
nre->cap_ = re->cap_;
|
nre->cap_ = re->cap();
|
||||||
nre->simple_ = true;
|
nre->simple_ = true;
|
||||||
return nre;
|
return nre;
|
||||||
}
|
}
|
||||||
@ -323,13 +589,12 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
|||||||
return Regexp::Plus(re->Incref(), f);
|
return Regexp::Plus(re->Incref(), f);
|
||||||
|
|
||||||
// General case: x{4,} is xxxx+
|
// General case: x{4,} is xxxx+
|
||||||
Regexp* nre = new Regexp(kRegexpConcat, f);
|
Regexp** nre_subs = new Regexp*[min];
|
||||||
nre->AllocSub(min);
|
|
||||||
VLOG(1) << "Simplify " << min;
|
|
||||||
Regexp** nre_subs = nre->sub();
|
|
||||||
for (int i = 0; i < min-1; i++)
|
for (int i = 0; i < min-1; i++)
|
||||||
nre_subs[i] = re->Incref();
|
nre_subs[i] = re->Incref();
|
||||||
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
||||||
|
Regexp* nre = Regexp::Concat(nre_subs, min, f);
|
||||||
|
delete[] nre_subs;
|
||||||
return nre;
|
return nre;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -348,11 +613,11 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
|||||||
// Build leading prefix: xx. Capturing only on the last one.
|
// Build leading prefix: xx. Capturing only on the last one.
|
||||||
Regexp* nre = NULL;
|
Regexp* nre = NULL;
|
||||||
if (min > 0) {
|
if (min > 0) {
|
||||||
nre = new Regexp(kRegexpConcat, f);
|
Regexp** nre_subs = new Regexp*[min];
|
||||||
nre->AllocSub(min);
|
|
||||||
Regexp** nre_subs = nre->sub();
|
|
||||||
for (int i = 0; i < min; i++)
|
for (int i = 0; i < min; i++)
|
||||||
nre_subs[i] = re->Incref();
|
nre_subs[i] = re->Incref();
|
||||||
|
nre = Regexp::Concat(nre_subs, min, f);
|
||||||
|
delete[] nre_subs;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build and attach suffix: (x(x(x)?)?)?
|
// Build and attach suffix: (x(x(x)?)?)?
|
||||||
|
65
contrib/libre2/re2/stringpiece.cc
Normal file
65
contrib/libre2/re2/stringpiece.cc
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#include "re2/stringpiece.h"
|
||||||
|
|
||||||
|
#include <ostream>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
|
||||||
|
namespace re2 {
|
||||||
|
|
||||||
|
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
|
||||||
|
|
||||||
|
// Copies at most n bytes of this piece, starting at offset pos, into buf.
// Returns the number of bytes actually copied; no NUL terminator is written.
// If pos is at or beyond the end of the piece, nothing is copied and 0 is
// returned.
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
                                         size_type pos) const {
  // size_type is unsigned: without this guard, size_ - pos would wrap around
  // to a huge value when pos > size_ and memcpy would read out of bounds.
  if (pos >= size_) return 0;
  size_type ret = std::min(size_ - pos, n);
  // Skip the call entirely for an empty copy so that a null buf (or null
  // data_) is never handed to memcpy.
  if (ret > 0) memcpy(buf, data_ + pos, ret);
  return ret;
}
|
||||||
|
|
||||||
|
// Returns the sub-piece that begins at offset pos and spans at most n bytes.
// Both arguments are clamped to the bounds of this piece, so an out-of-range
// pos yields an empty piece positioned at the end.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type start = std::min(pos, size_);
  const size_type count = std::min(n, size_ - start);
  return StringPiece(data_ + start, count);
}
|
||||||
|
|
||||||
|
// Finds the first occurrence of s at or after offset pos.
// Returns the offset of the match, or npos when pos is past the end or no
// match exists.
StringPiece::size_type StringPiece::find(const StringPiece& s,
                                         size_type pos) const {
  if (pos > size_) return npos;

  const_pointer const haystack_end = data_ + size_;
  const_pointer const hit =
      std::search(data_ + pos, haystack_end, s.data_, s.data_ + s.size_);
  const size_type offset = hit - data_;

  // std::search returns haystack_end on failure; in that case the needle
  // cannot fit between offset and the end unless it is empty.
  if (offset + s.size_ > size_) return npos;
  return offset;
}
|
||||||
|
|
||||||
|
// Finds the first occurrence of the byte c at or after offset pos.
// Returns its offset, or npos when there is none.
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
  // pos >= size_ also covers the empty piece, since size_ is then 0.
  if (pos >= size_) return npos;

  const_pointer const end = data_ + size_;
  const_pointer const hit = std::find(data_ + pos, end, c);
  if (hit == end) return npos;
  return static_cast<size_type>(hit - data_);
}
|
||||||
|
|
||||||
|
// Finds the last occurrence of s that starts at or before offset pos.
// Returns the offset of the match, or npos when there is none.
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
                                          size_type pos) const {
  const size_type needle_len = s.size_;
  if (needle_len > size_) return npos;
  if (needle_len == 0) return std::min(size_, pos);

  // The latest admissible match start is min(size_ - needle_len, pos); the
  // searched range therefore ends needle_len bytes after that point.
  const size_type latest_start = std::min(size_ - needle_len, pos);
  const_pointer const search_end = data_ + latest_start + needle_len;
  const_pointer const hit =
      std::find_end(data_, search_end, s.data_, s.data_ + needle_len);

  if (hit == search_end) return npos;
  return static_cast<size_type>(hit - data_);
}
|
||||||
|
|
||||||
|
// Finds the last occurrence of the byte c at or before offset pos.
// Returns its offset, or npos when there is none. pos defaults to npos in the
// declaration, which means "search from the very end".
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
  if (size_ == 0) return npos;
  // Clamp pos to the last valid index *before* adding one. Computing
  // pos + 1 first wraps around to 0 when pos == npos (the default), which
  // would make the loop below run zero times and always report npos.
  for (size_type i = std::min(pos, size_ - 1) + 1; i != 0;) {
    if (data_[--i] == c) return i;
  }
  return npos;
}
|
||||||
|
|
||||||
|
// Streams the raw bytes of p (exactly p.size() of them, embedded NULs
// included) to o, so that a StringPiece can be logged.
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
  return o.write(p.data(), p.size());
}
|
||||||
|
|
||||||
|
} // namespace re2
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_STRINGPIECE_H_
|
||||||
|
#define RE2_STRINGPIECE_H_
|
||||||
|
|
||||||
// A string-like object that points to a sized piece of memory.
|
// A string-like object that points to a sized piece of memory.
|
||||||
//
|
//
|
||||||
// Functions or methods may use const StringPiece& parameters to accept either
|
// Functions or methods may use const StringPiece& parameters to accept either
|
||||||
@ -16,140 +19,145 @@
|
|||||||
//
|
//
|
||||||
// Arghh! I wish C++ literals were "string".
|
// Arghh! I wish C++ literals were "string".
|
||||||
|
|
||||||
#ifndef STRINGS_STRINGPIECE_H__
|
#include <stddef.h>
|
||||||
#define STRINGS_STRINGPIECE_H__
|
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <cstddef>
|
#include <algorithm>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
|
#include <iterator>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
class StringPiece {
|
class StringPiece {
|
||||||
private:
|
|
||||||
const char* ptr_;
|
|
||||||
int length_;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
typedef char value_type;
|
||||||
|
typedef char* pointer;
|
||||||
|
typedef const char* const_pointer;
|
||||||
|
typedef char& reference;
|
||||||
|
typedef const char& const_reference;
|
||||||
|
typedef const char* const_iterator;
|
||||||
|
typedef const_iterator iterator;
|
||||||
|
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
|
||||||
|
typedef const_reverse_iterator reverse_iterator;
|
||||||
|
typedef size_t size_type;
|
||||||
|
typedef ptrdiff_t difference_type;
|
||||||
|
static const size_type npos = static_cast<size_type>(-1);
|
||||||
|
|
||||||
// We provide non-explicit singleton constructors so users can pass
|
// We provide non-explicit singleton constructors so users can pass
|
||||||
// in a "const char*" or a "string" wherever a "StringPiece" is
|
// in a "const char*" or a "string" wherever a "StringPiece" is
|
||||||
// expected.
|
// expected.
|
||||||
StringPiece() : ptr_(NULL), length_(0) { }
|
StringPiece()
|
||||||
StringPiece(const char* str)
|
: data_(NULL), size_(0) {}
|
||||||
: ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
|
|
||||||
StringPiece(const std::string& str)
|
StringPiece(const std::string& str)
|
||||||
: ptr_(str.data()), length_(static_cast<int>(str.size())) { }
|
: data_(str.data()), size_(str.size()) {}
|
||||||
StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }
|
StringPiece(const char* str)
|
||||||
|
: data_(str), size_(str == NULL ? 0 : strlen(str)) {}
|
||||||
|
StringPiece(const char* str, size_type len)
|
||||||
|
: data_(str), size_(len) {}
|
||||||
|
|
||||||
// data() may return a pointer to a buffer with embedded NULs, and the
|
const_iterator begin() const { return data_; }
|
||||||
// returned buffer may or may not be null terminated. Therefore it is
|
const_iterator end() const { return data_ + size_; }
|
||||||
// typically a mistake to pass data() to a routine that expects a NUL
|
const_reverse_iterator rbegin() const {
|
||||||
// terminated string.
|
return const_reverse_iterator(data_ + size_);
|
||||||
const char* data() const { return ptr_; }
|
}
|
||||||
int size() const { return length_; }
|
const_reverse_iterator rend() const {
|
||||||
int length() const { return length_; }
|
return const_reverse_iterator(data_);
|
||||||
bool empty() const { return length_ == 0; }
|
}
|
||||||
|
|
||||||
|
size_type size() const { return size_; }
|
||||||
|
size_type length() const { return size_; }
|
||||||
|
bool empty() const { return size_ == 0; }
|
||||||
|
|
||||||
|
const_reference operator[](size_type i) const { return data_[i]; }
|
||||||
|
const_pointer data() const { return data_; }
|
||||||
|
|
||||||
|
void remove_prefix(size_type n) {
|
||||||
|
data_ += n;
|
||||||
|
size_ -= n;
|
||||||
|
}
|
||||||
|
|
||||||
|
void remove_suffix(size_type n) {
|
||||||
|
size_ -= n;
|
||||||
|
}
|
||||||
|
|
||||||
void clear() { ptr_ = NULL; length_ = 0; }
|
|
||||||
void set(const char* data, int len) { ptr_ = data; length_ = len; }
|
|
||||||
void set(const char* str) {
|
void set(const char* str) {
|
||||||
ptr_ = str;
|
data_ = str;
|
||||||
if (str != NULL)
|
size_ = str == NULL ? 0 : strlen(str);
|
||||||
length_ = static_cast<int>(strlen(str));
|
|
||||||
else
|
|
||||||
length_ = 0;
|
|
||||||
}
|
|
||||||
void set(const void* data, int len) {
|
|
||||||
ptr_ = reinterpret_cast<const char*>(data);
|
|
||||||
length_ = len;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char operator[](int i) const { return ptr_[i]; }
|
void set(const char* str, size_type len) {
|
||||||
|
data_ = str;
|
||||||
void remove_prefix(int n) {
|
size_ = len;
|
||||||
ptr_ += n;
|
|
||||||
length_ -= n;
|
|
||||||
}
|
|
||||||
|
|
||||||
void remove_suffix(int n) {
|
|
||||||
length_ -= n;
|
|
||||||
}
|
|
||||||
|
|
||||||
int compare(const StringPiece& x) const {
|
|
||||||
int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_));
|
|
||||||
if (r == 0) {
|
|
||||||
if (length_ < x.length_) r = -1;
|
|
||||||
else if (length_ > x.length_) r = +1;
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string as_string() const {
|
std::string as_string() const {
|
||||||
return std::string(data(), size());
|
return std::string(data_, size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// We also define ToString() here, since many other string-like
|
// We also define ToString() here, since many other string-like
|
||||||
// interfaces name the routine that converts to a C++ string
|
// interfaces name the routine that converts to a C++ string
|
||||||
// "ToString", and it's confusing to have the method that does that
|
// "ToString", and it's confusing to have the method that does that
|
||||||
// for a StringPiece be called "as_string()". We also leave the
|
// for a StringPiece be called "as_string()". We also leave the
|
||||||
// "as_string()" method defined here for existing code.
|
// "as_string()" method defined here for existing code.
|
||||||
std::string ToString() const {
|
std::string ToString() const {
|
||||||
return std::string(data(), size());
|
return std::string(data_, size_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CopyToString(std::string* target) const;
|
void CopyToString(std::string* target) const {
|
||||||
void AppendToString(std::string* target) const;
|
target->assign(data_, size_);
|
||||||
|
}
|
||||||
|
|
||||||
// Does "this" start with "x"
|
void AppendToString(std::string* target) const {
|
||||||
|
target->append(data_, size_);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_type copy(char* buf, size_type n, size_type pos = 0) const;
|
||||||
|
StringPiece substr(size_type pos = 0, size_type n = npos) const;
|
||||||
|
|
||||||
|
int compare(const StringPiece& x) const {
|
||||||
|
size_type min_size = std::min(size(), x.size());
|
||||||
|
if (min_size > 0) {
|
||||||
|
int r = memcmp(data(), x.data(), min_size);
|
||||||
|
if (r < 0) return -1;
|
||||||
|
if (r > 0) return 1;
|
||||||
|
}
|
||||||
|
if (size() < x.size()) return -1;
|
||||||
|
if (size() > x.size()) return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Does "this" start with "x"?
|
||||||
bool starts_with(const StringPiece& x) const {
|
bool starts_with(const StringPiece& x) const {
|
||||||
return ((length_ >= x.length_) &&
|
return x.empty() ||
|
||||||
(memcmp(ptr_, x.ptr_, x.length_) == 0));
|
(size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Does "this" end with "x"
|
// Does "this" end with "x"?
|
||||||
bool ends_with(const StringPiece& x) const {
|
bool ends_with(const StringPiece& x) const {
|
||||||
return ((length_ >= x.length_) &&
|
return x.empty() ||
|
||||||
(memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
|
(size() >= x.size() &&
|
||||||
|
memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// standard STL container boilerplate
|
bool contains(const StringPiece& s) const {
|
||||||
typedef char value_type;
|
return find(s) != npos;
|
||||||
typedef const char* pointer;
|
|
||||||
typedef const char& reference;
|
|
||||||
typedef const char& const_reference;
|
|
||||||
typedef size_t size_type;
|
|
||||||
typedef ptrdiff_t difference_type;
|
|
||||||
static const size_type npos;
|
|
||||||
typedef const char* const_iterator;
|
|
||||||
typedef const char* iterator;
|
|
||||||
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
|
|
||||||
typedef std::reverse_iterator<iterator> reverse_iterator;
|
|
||||||
iterator begin() const { return ptr_; }
|
|
||||||
iterator end() const { return ptr_ + length_; }
|
|
||||||
const_reverse_iterator rbegin() const {
|
|
||||||
return const_reverse_iterator(ptr_ + length_);
|
|
||||||
}
|
}
|
||||||
const_reverse_iterator rend() const {
|
|
||||||
return const_reverse_iterator(ptr_);
|
|
||||||
}
|
|
||||||
// STLS says return size_type, but Google says return int
|
|
||||||
int max_size() const { return length_; }
|
|
||||||
int capacity() const { return length_; }
|
|
||||||
|
|
||||||
int copy(char* buf, size_type n, size_type pos = 0) const;
|
size_type find(const StringPiece& s, size_type pos = 0) const;
|
||||||
|
size_type find(char c, size_type pos = 0) const;
|
||||||
|
size_type rfind(const StringPiece& s, size_type pos = npos) const;
|
||||||
|
size_type rfind(char c, size_type pos = npos) const;
|
||||||
|
|
||||||
int find(const StringPiece& s, size_type pos = 0) const;
|
private:
|
||||||
int find(char c, size_type pos = 0) const;
|
const_pointer data_;
|
||||||
int rfind(const StringPiece& s, size_type pos = npos) const;
|
size_type size_;
|
||||||
int rfind(char c, size_type pos = npos) const;
|
|
||||||
|
|
||||||
StringPiece substr(size_type pos, size_type n = npos) const;
|
|
||||||
|
|
||||||
static bool _equal(const StringPiece&, const StringPiece&);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
||||||
return StringPiece::_equal(x, y);
|
StringPiece::size_type len = x.size();
|
||||||
|
if (len != y.size()) return false;
|
||||||
|
return x.data() == y.data() || len == 0 ||
|
||||||
|
memcmp(x.data(), y.data(), len) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
||||||
@ -157,9 +165,9 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
inline bool operator<(const StringPiece& x, const StringPiece& y) {
|
inline bool operator<(const StringPiece& x, const StringPiece& y) {
|
||||||
const int r = memcmp(x.data(), y.data(),
|
StringPiece::size_type min_size = std::min(x.size(), y.size());
|
||||||
std::min(x.size(), y.size()));
|
int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
|
||||||
return ((r < 0) || ((r == 0) && (x.size() < y.size())));
|
return (r < 0) || (r == 0 && x.size() < y.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
inline bool operator>(const StringPiece& x, const StringPiece& y) {
|
inline bool operator>(const StringPiece& x, const StringPiece& y) {
|
||||||
@ -174,9 +182,9 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) {
|
|||||||
return !(x < y);
|
return !(x < y);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Allow StringPiece to be logged.
|
||||||
|
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
// allow StringPiece to be logged
|
#endif // RE2_STRINGPIECE_H_
|
||||||
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
|
|
||||||
|
|
||||||
#endif // STRINGS_STRINGPIECE_H__
|
|
||||||
|
@ -5,7 +5,13 @@
|
|||||||
// Format a regular expression structure as a string.
|
// Format a regular expression structure as a string.
|
||||||
// Tested by parse_test.cc
|
// Tested by parse_test.cc
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
#include "util/utf.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
#include "re2/walker-inl.h"
|
#include "re2/walker-inl.h"
|
||||||
|
|
||||||
@ -42,7 +48,8 @@ class ToStringWalker : public Regexp::Walker<int> {
|
|||||||
private:
|
private:
|
||||||
string* t_; // The string the walker appends to.
|
string* t_; // The string the walker appends to.
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
|
ToStringWalker(const ToStringWalker&) = delete;
|
||||||
|
ToStringWalker& operator=(const ToStringWalker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
string Regexp::ToString() {
|
string Regexp::ToString() {
|
||||||
@ -94,6 +101,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
|||||||
|
|
||||||
case kRegexpCapture:
|
case kRegexpCapture:
|
||||||
t_->append("(");
|
t_->append("(");
|
||||||
|
if (re->cap() == 0)
|
||||||
|
LOG(DFATAL) << "kRegexpCapture cap() == 0";
|
||||||
if (re->name()) {
|
if (re->name()) {
|
||||||
t_->append("?P<");
|
t_->append("?P<");
|
||||||
t_->append(*re->name());
|
t_->append(*re->name());
|
||||||
@ -120,13 +129,12 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
|||||||
static void AppendLiteral(string *t, Rune r, bool foldcase) {
|
static void AppendLiteral(string *t, Rune r, bool foldcase) {
|
||||||
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
|
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
|
||||||
t->append(1, '\\');
|
t->append(1, '\\');
|
||||||
t->append(1, r);
|
t->append(1, static_cast<char>(r));
|
||||||
} else if (foldcase && 'a' <= r && r <= 'z') {
|
} else if (foldcase && 'a' <= r && r <= 'z') {
|
||||||
if ('a' <= r && r <= 'z')
|
r -= 'a' - 'A';
|
||||||
r += 'A' - 'a';
|
|
||||||
t->append(1, '[');
|
t->append(1, '[');
|
||||||
t->append(1, r);
|
t->append(1, static_cast<char>(r));
|
||||||
t->append(1, r + 'a' - 'A');
|
t->append(1, static_cast<char>(r) + 'a' - 'A');
|
||||||
t->append(1, ']');
|
t->append(1, ']');
|
||||||
} else {
|
} else {
|
||||||
AppendCCRange(t, r, r);
|
AppendCCRange(t, r, r);
|
||||||
@ -154,12 +162,14 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case kRegexpLiteral:
|
case kRegexpLiteral:
|
||||||
AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
|
AppendLiteral(t_, re->rune(),
|
||||||
|
(re->parse_flags() & Regexp::FoldCase) != 0);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case kRegexpLiteralString:
|
case kRegexpLiteralString:
|
||||||
for (int i = 0; i < re->nrunes(); i++)
|
for (int i = 0; i < re->nrunes(); i++)
|
||||||
AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
|
AppendLiteral(t_, re->runes()[i],
|
||||||
|
(re->parse_flags() & Regexp::FoldCase) != 0);
|
||||||
if (prec < PrecConcat)
|
if (prec < PrecConcat)
|
||||||
t_->append(")");
|
t_->append(")");
|
||||||
break;
|
break;
|
||||||
@ -297,7 +307,7 @@ static void AppendCCChar(string* t, Rune r) {
|
|||||||
if (0x20 <= r && r <= 0x7E) {
|
if (0x20 <= r && r <= 0x7E) {
|
||||||
if (strchr("[]^-\\", r))
|
if (strchr("[]^-\\", r))
|
||||||
t->append("\\");
|
t->append("\\");
|
||||||
t->append(1, r);
|
t->append(1, static_cast<char>(r));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
switch (r) {
|
switch (r) {
|
||||||
|
@ -9,7 +9,7 @@ import re
|
|||||||
import urllib2
|
import urllib2
|
||||||
|
|
||||||
# Directory or URL where Unicode tables reside.
|
# Directory or URL where Unicode tables reside.
|
||||||
_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd"
|
_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd"
|
||||||
|
|
||||||
# Largest valid Unicode code value.
|
# Largest valid Unicode code value.
|
||||||
_RUNE_MAX = 0x10FFFF
|
_RUNE_MAX = 0x10FFFF
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
|
|
||||||
// 1034 groups, 2089 pairs, 289 ranges
|
// 1295 groups, 2620 pairs, 343 ranges
|
||||||
const CaseFold unicode_casefold[] = {
|
const CaseFold unicode_casefold[] = {
|
||||||
{ 65, 90, 32 },
|
{ 65, 90, 32 },
|
||||||
{ 97, 106, -32 },
|
{ 97, 106, -32 },
|
||||||
@ -105,13 +105,17 @@ const CaseFold unicode_casefold[] = {
|
|||||||
{ 598, 599, -205 },
|
{ 598, 599, -205 },
|
||||||
{ 601, 601, -202 },
|
{ 601, 601, -202 },
|
||||||
{ 603, 603, -203 },
|
{ 603, 603, -203 },
|
||||||
|
{ 604, 604, 42319 },
|
||||||
{ 608, 608, -205 },
|
{ 608, 608, -205 },
|
||||||
|
{ 609, 609, 42315 },
|
||||||
{ 611, 611, -207 },
|
{ 611, 611, -207 },
|
||||||
{ 613, 613, 42280 },
|
{ 613, 613, 42280 },
|
||||||
{ 614, 614, 42308 },
|
{ 614, 614, 42308 },
|
||||||
{ 616, 616, -209 },
|
{ 616, 616, -209 },
|
||||||
{ 617, 617, -211 },
|
{ 617, 617, -211 },
|
||||||
|
{ 618, 618, 42308 },
|
||||||
{ 619, 619, 10743 },
|
{ 619, 619, 10743 },
|
||||||
|
{ 620, 620, 42305 },
|
||||||
{ 623, 623, -211 },
|
{ 623, 623, -211 },
|
||||||
{ 625, 625, 10749 },
|
{ 625, 625, 10749 },
|
||||||
{ 626, 626, -213 },
|
{ 626, 626, -213 },
|
||||||
@ -119,15 +123,19 @@ const CaseFold unicode_casefold[] = {
|
|||||||
{ 637, 637, 10727 },
|
{ 637, 637, 10727 },
|
||||||
{ 640, 640, -218 },
|
{ 640, 640, -218 },
|
||||||
{ 643, 643, -218 },
|
{ 643, 643, -218 },
|
||||||
|
{ 647, 647, 42282 },
|
||||||
{ 648, 648, -218 },
|
{ 648, 648, -218 },
|
||||||
{ 649, 649, -69 },
|
{ 649, 649, -69 },
|
||||||
{ 650, 651, -217 },
|
{ 650, 651, -217 },
|
||||||
{ 652, 652, -71 },
|
{ 652, 652, -71 },
|
||||||
{ 658, 658, -219 },
|
{ 658, 658, -219 },
|
||||||
|
{ 669, 669, 42261 },
|
||||||
|
{ 670, 670, 42258 },
|
||||||
{ 837, 837, 84 },
|
{ 837, 837, 84 },
|
||||||
{ 880, 883, EvenOdd },
|
{ 880, 883, EvenOdd },
|
||||||
{ 886, 887, EvenOdd },
|
{ 886, 887, EvenOdd },
|
||||||
{ 891, 893, 130 },
|
{ 891, 893, 130 },
|
||||||
|
{ 895, 895, 116 },
|
||||||
{ 902, 902, 38 },
|
{ 902, 902, 38 },
|
||||||
{ 904, 906, 37 },
|
{ 904, 906, 37 },
|
||||||
{ 908, 908, 64 },
|
{ 908, 908, 64 },
|
||||||
@ -168,6 +176,7 @@ const CaseFold unicode_casefold[] = {
|
|||||||
{ 1008, 1008, -86 },
|
{ 1008, 1008, -86 },
|
||||||
{ 1009, 1009, -80 },
|
{ 1009, 1009, -80 },
|
||||||
{ 1010, 1010, 7 },
|
{ 1010, 1010, 7 },
|
||||||
|
{ 1011, 1011, -116 },
|
||||||
{ 1012, 1012, -92 },
|
{ 1012, 1012, -92 },
|
||||||
{ 1013, 1013, -96 },
|
{ 1013, 1013, -96 },
|
||||||
{ 1015, 1016, OddEven },
|
{ 1015, 1016, OddEven },
|
||||||
@ -176,19 +185,43 @@ const CaseFold unicode_casefold[] = {
|
|||||||
{ 1021, 1023, -130 },
|
{ 1021, 1023, -130 },
|
||||||
{ 1024, 1039, 80 },
|
{ 1024, 1039, 80 },
|
||||||
{ 1040, 1071, 32 },
|
{ 1040, 1071, 32 },
|
||||||
{ 1072, 1103, -32 },
|
{ 1072, 1073, -32 },
|
||||||
|
{ 1074, 1074, 6222 },
|
||||||
|
{ 1075, 1075, -32 },
|
||||||
|
{ 1076, 1076, 6221 },
|
||||||
|
{ 1077, 1085, -32 },
|
||||||
|
{ 1086, 1086, 6212 },
|
||||||
|
{ 1087, 1088, -32 },
|
||||||
|
{ 1089, 1090, 6210 },
|
||||||
|
{ 1091, 1097, -32 },
|
||||||
|
{ 1098, 1098, 6204 },
|
||||||
|
{ 1099, 1103, -32 },
|
||||||
{ 1104, 1119, -80 },
|
{ 1104, 1119, -80 },
|
||||||
{ 1120, 1153, EvenOdd },
|
{ 1120, 1122, EvenOdd },
|
||||||
|
{ 1123, 1123, 6180 },
|
||||||
|
{ 1124, 1153, EvenOdd },
|
||||||
{ 1162, 1215, EvenOdd },
|
{ 1162, 1215, EvenOdd },
|
||||||
{ 1216, 1216, 15 },
|
{ 1216, 1216, 15 },
|
||||||
{ 1217, 1230, OddEven },
|
{ 1217, 1230, OddEven },
|
||||||
{ 1231, 1231, -15 },
|
{ 1231, 1231, -15 },
|
||||||
{ 1232, 1319, EvenOdd },
|
{ 1232, 1327, EvenOdd },
|
||||||
{ 1329, 1366, 48 },
|
{ 1329, 1366, 48 },
|
||||||
{ 1377, 1414, -48 },
|
{ 1377, 1414, -48 },
|
||||||
{ 4256, 4293, 7264 },
|
{ 4256, 4293, 7264 },
|
||||||
{ 4295, 4295, 7264 },
|
{ 4295, 4295, 7264 },
|
||||||
{ 4301, 4301, 7264 },
|
{ 4301, 4301, 7264 },
|
||||||
|
{ 5024, 5103, 38864 },
|
||||||
|
{ 5104, 5109, 8 },
|
||||||
|
{ 5112, 5117, -8 },
|
||||||
|
{ 7296, 7296, -6254 },
|
||||||
|
{ 7297, 7297, -6253 },
|
||||||
|
{ 7298, 7298, -6244 },
|
||||||
|
{ 7299, 7299, -6242 },
|
||||||
|
{ 7300, 7300, EvenOdd },
|
||||||
|
{ 7301, 7301, -6243 },
|
||||||
|
{ 7302, 7302, -6236 },
|
||||||
|
{ 7303, 7303, -6181 },
|
||||||
|
{ 7304, 7304, 35266 },
|
||||||
{ 7545, 7545, 35332 },
|
{ 7545, 7545, 35332 },
|
||||||
{ 7549, 7549, 3814 },
|
{ 7549, 7549, 3814 },
|
||||||
{ 7680, 7776, EvenOdd },
|
{ 7680, 7776, EvenOdd },
|
||||||
@ -282,8 +315,10 @@ const CaseFold unicode_casefold[] = {
|
|||||||
{ 11520, 11557, -7264 },
|
{ 11520, 11557, -7264 },
|
||||||
{ 11559, 11559, -7264 },
|
{ 11559, 11559, -7264 },
|
||||||
{ 11565, 11565, -7264 },
|
{ 11565, 11565, -7264 },
|
||||||
{ 42560, 42605, EvenOdd },
|
{ 42560, 42570, EvenOdd },
|
||||||
{ 42624, 42647, EvenOdd },
|
{ 42571, 42571, -35267 },
|
||||||
|
{ 42572, 42605, EvenOdd },
|
||||||
|
{ 42624, 42651, EvenOdd },
|
||||||
{ 42786, 42799, EvenOdd },
|
{ 42786, 42799, EvenOdd },
|
||||||
{ 42802, 42863, EvenOdd },
|
{ 42802, 42863, EvenOdd },
|
||||||
{ 42873, 42876, OddEven },
|
{ 42873, 42876, OddEven },
|
||||||
@ -292,16 +327,35 @@ const CaseFold unicode_casefold[] = {
|
|||||||
{ 42891, 42892, OddEven },
|
{ 42891, 42892, OddEven },
|
||||||
{ 42893, 42893, -42280 },
|
{ 42893, 42893, -42280 },
|
||||||
{ 42896, 42899, EvenOdd },
|
{ 42896, 42899, EvenOdd },
|
||||||
{ 42912, 42921, EvenOdd },
|
{ 42902, 42921, EvenOdd },
|
||||||
{ 42922, 42922, -42308 },
|
{ 42922, 42922, -42308 },
|
||||||
|
{ 42923, 42923, -42319 },
|
||||||
|
{ 42924, 42924, -42315 },
|
||||||
|
{ 42925, 42925, -42305 },
|
||||||
|
{ 42926, 42926, -42308 },
|
||||||
|
{ 42928, 42928, -42258 },
|
||||||
|
{ 42929, 42929, -42282 },
|
||||||
|
{ 42930, 42930, -42261 },
|
||||||
|
{ 42931, 42931, 928 },
|
||||||
|
{ 42932, 42935, EvenOdd },
|
||||||
|
{ 43859, 43859, -928 },
|
||||||
|
{ 43888, 43967, -38864 },
|
||||||
{ 65313, 65338, 32 },
|
{ 65313, 65338, 32 },
|
||||||
{ 65345, 65370, -32 },
|
{ 65345, 65370, -32 },
|
||||||
{ 66560, 66599, 40 },
|
{ 66560, 66599, 40 },
|
||||||
{ 66600, 66639, -40 },
|
{ 66600, 66639, -40 },
|
||||||
|
{ 66736, 66771, 40 },
|
||||||
|
{ 66776, 66811, -40 },
|
||||||
|
{ 68736, 68786, 64 },
|
||||||
|
{ 68800, 68850, -64 },
|
||||||
|
{ 71840, 71871, 32 },
|
||||||
|
{ 71872, 71903, -32 },
|
||||||
|
{ 125184, 125217, 34 },
|
||||||
|
{ 125218, 125251, -34 },
|
||||||
};
|
};
|
||||||
const int num_unicode_casefold = 289;
|
const int num_unicode_casefold = 343;
|
||||||
|
|
||||||
// 1034 groups, 1055 pairs, 167 ranges
|
// 1295 groups, 1325 pairs, 191 ranges
|
||||||
const CaseFold unicode_tolower[] = {
|
const CaseFold unicode_tolower[] = {
|
||||||
{ 65, 90, 32 },
|
{ 65, 90, 32 },
|
||||||
{ 181, 181, 775 },
|
{ 181, 181, 775 },
|
||||||
@ -370,6 +424,7 @@ const CaseFold unicode_tolower[] = {
|
|||||||
{ 837, 837, 116 },
|
{ 837, 837, 116 },
|
||||||
{ 880, 882, EvenOddSkip },
|
{ 880, 882, EvenOddSkip },
|
||||||
{ 886, 886, EvenOdd },
|
{ 886, 886, EvenOdd },
|
||||||
|
{ 895, 895, 116 },
|
||||||
{ 902, 902, 38 },
|
{ 902, 902, 38 },
|
||||||
{ 904, 906, 37 },
|
{ 904, 906, 37 },
|
||||||
{ 908, 908, 64 },
|
{ 908, 908, 64 },
|
||||||
@ -397,11 +452,20 @@ const CaseFold unicode_tolower[] = {
|
|||||||
{ 1162, 1214, EvenOddSkip },
|
{ 1162, 1214, EvenOddSkip },
|
||||||
{ 1216, 1216, 15 },
|
{ 1216, 1216, 15 },
|
||||||
{ 1217, 1229, OddEvenSkip },
|
{ 1217, 1229, OddEvenSkip },
|
||||||
{ 1232, 1318, EvenOddSkip },
|
{ 1232, 1326, EvenOddSkip },
|
||||||
{ 1329, 1366, 48 },
|
{ 1329, 1366, 48 },
|
||||||
{ 4256, 4293, 7264 },
|
{ 4256, 4293, 7264 },
|
||||||
{ 4295, 4295, 7264 },
|
{ 4295, 4295, 7264 },
|
||||||
{ 4301, 4301, 7264 },
|
{ 4301, 4301, 7264 },
|
||||||
|
{ 5112, 5117, -8 },
|
||||||
|
{ 7296, 7296, -6222 },
|
||||||
|
{ 7297, 7297, -6221 },
|
||||||
|
{ 7298, 7298, -6212 },
|
||||||
|
{ 7299, 7300, -6210 },
|
||||||
|
{ 7301, 7301, -6211 },
|
||||||
|
{ 7302, 7302, -6204 },
|
||||||
|
{ 7303, 7303, -6180 },
|
||||||
|
{ 7304, 7304, 35267 },
|
||||||
{ 7680, 7828, EvenOddSkip },
|
{ 7680, 7828, EvenOddSkip },
|
||||||
{ 7835, 7835, -58 },
|
{ 7835, 7835, -58 },
|
||||||
{ 7838, 7838, -7615 },
|
{ 7838, 7838, -7615 },
|
||||||
@ -457,7 +521,7 @@ const CaseFold unicode_tolower[] = {
|
|||||||
{ 11499, 11501, OddEvenSkip },
|
{ 11499, 11501, OddEvenSkip },
|
||||||
{ 11506, 11506, EvenOdd },
|
{ 11506, 11506, EvenOdd },
|
||||||
{ 42560, 42604, EvenOddSkip },
|
{ 42560, 42604, EvenOddSkip },
|
||||||
{ 42624, 42646, EvenOddSkip },
|
{ 42624, 42650, EvenOddSkip },
|
||||||
{ 42786, 42798, EvenOddSkip },
|
{ 42786, 42798, EvenOddSkip },
|
||||||
{ 42802, 42862, EvenOddSkip },
|
{ 42802, 42862, EvenOddSkip },
|
||||||
{ 42873, 42875, OddEvenSkip },
|
{ 42873, 42875, OddEvenSkip },
|
||||||
@ -466,12 +530,26 @@ const CaseFold unicode_tolower[] = {
|
|||||||
{ 42891, 42891, OddEven },
|
{ 42891, 42891, OddEven },
|
||||||
{ 42893, 42893, -42280 },
|
{ 42893, 42893, -42280 },
|
||||||
{ 42896, 42898, EvenOddSkip },
|
{ 42896, 42898, EvenOddSkip },
|
||||||
{ 42912, 42920, EvenOddSkip },
|
{ 42902, 42920, EvenOddSkip },
|
||||||
{ 42922, 42922, -42308 },
|
{ 42922, 42922, -42308 },
|
||||||
|
{ 42923, 42923, -42319 },
|
||||||
|
{ 42924, 42924, -42315 },
|
||||||
|
{ 42925, 42925, -42305 },
|
||||||
|
{ 42926, 42926, -42308 },
|
||||||
|
{ 42928, 42928, -42258 },
|
||||||
|
{ 42929, 42929, -42282 },
|
||||||
|
{ 42930, 42930, -42261 },
|
||||||
|
{ 42931, 42931, 928 },
|
||||||
|
{ 42932, 42934, EvenOddSkip },
|
||||||
|
{ 43888, 43967, -38864 },
|
||||||
{ 65313, 65338, 32 },
|
{ 65313, 65338, 32 },
|
||||||
{ 66560, 66599, 40 },
|
{ 66560, 66599, 40 },
|
||||||
|
{ 66736, 66771, 40 },
|
||||||
|
{ 68736, 68786, 64 },
|
||||||
|
{ 71840, 71871, 32 },
|
||||||
|
{ 125184, 125217, 34 },
|
||||||
};
|
};
|
||||||
const int num_unicode_tolower = 167;
|
const int num_unicode_tolower = 191;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_UNICODE_CASEFOLD_H_
|
||||||
|
#define RE2_UNICODE_CASEFOLD_H_
|
||||||
|
|
||||||
// Unicode case folding tables.
|
// Unicode case folding tables.
|
||||||
|
|
||||||
// The Unicode case folding tables encode the mapping from one Unicode point
|
// The Unicode case folding tables encode the mapping from one Unicode point
|
||||||
@ -16,7 +19,7 @@
|
|||||||
// 'K' -> 'K'
|
// 'K' -> 'K'
|
||||||
//
|
//
|
||||||
// Like everything Unicode, these tables are big. If we represent the table
|
// Like everything Unicode, these tables are big. If we represent the table
|
||||||
// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB.
|
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
|
||||||
// Most table entries look like the ones around them:
|
// Most table entries look like the ones around them:
|
||||||
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
|
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
|
||||||
// Instead of listing all the pairs explicitly, we make a list of ranges
|
// Instead of listing all the pairs explicitly, we make a list of ranges
|
||||||
@ -36,10 +39,10 @@
|
|||||||
// The grouped form also allows for efficient fold range calculations
|
// The grouped form also allows for efficient fold range calculations
|
||||||
// rather than looping one character at a time.
|
// rather than looping one character at a time.
|
||||||
|
|
||||||
#ifndef RE2_UNICODE_CASEFOLD_H__
|
#include <stdint.h>
|
||||||
#define RE2_UNICODE_CASEFOLD_H__
|
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/utf.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
@ -51,9 +54,9 @@ enum {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct CaseFold {
|
struct CaseFold {
|
||||||
uint32 lo;
|
Rune lo;
|
||||||
uint32 hi;
|
Rune hi;
|
||||||
int32 delta;
|
int32_t delta;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern const CaseFold unicode_casefold[];
|
extern const CaseFold unicode_casefold[];
|
||||||
@ -72,4 +75,4 @@ extern Rune ApplyFold(const CaseFold *f, Rune r);
|
|||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_UNICODE_CASEFOLD_H__
|
#endif // RE2_UNICODE_CASEFOLD_H_
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_UNICODE_GROUPS_H_
|
||||||
|
#define RE2_UNICODE_GROUPS_H_
|
||||||
|
|
||||||
// Unicode character groups.
|
// Unicode character groups.
|
||||||
|
|
||||||
// The codes get split into ranges of 16-bit codes
|
// The codes get split into ranges of 16-bit codes
|
||||||
@ -15,23 +18,23 @@
|
|||||||
// to 16.5 kB of data but make the data harder to use;
|
// to 16.5 kB of data but make the data harder to use;
|
||||||
// we don't bother.
|
// we don't bother.
|
||||||
|
|
||||||
#ifndef RE2_UNICODE_GROUPS_H__
|
#include <stdint.h>
|
||||||
#define RE2_UNICODE_GROUPS_H__
|
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
|
#include "util/utf.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
struct URange16
|
struct URange16
|
||||||
{
|
{
|
||||||
uint16 lo;
|
uint16_t lo;
|
||||||
uint16 hi;
|
uint16_t hi;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct URange32
|
struct URange32
|
||||||
{
|
{
|
||||||
uint32 lo;
|
Rune lo;
|
||||||
uint32 hi;
|
Rune hi;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct UGroup
|
struct UGroup
|
||||||
@ -61,4 +64,4 @@ extern const int num_perl_groups;
|
|||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_UNICODE_GROUPS_H__
|
#endif // RE2_UNICODE_GROUPS_H_
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef RE2_WALKER_INL_H_
|
||||||
|
#define RE2_WALKER_INL_H_
|
||||||
|
|
||||||
// Helper class for traversing Regexps without recursion.
|
// Helper class for traversing Regexps without recursion.
|
||||||
// Clients should declare their own subclasses that override
|
// Clients should declare their own subclasses that override
|
||||||
// the PreVisit and PostVisit methods, which are called before
|
// the PreVisit and PostVisit methods, which are called before
|
||||||
@ -10,9 +13,9 @@
|
|||||||
// Not quite the Visitor pattern, because (among other things)
|
// Not quite the Visitor pattern, because (among other things)
|
||||||
// the Visitor pattern is recursive.
|
// the Visitor pattern is recursive.
|
||||||
|
|
||||||
#ifndef RE2_WALKER_INL_H__
|
#include <stack>
|
||||||
#define RE2_WALKER_INL_H__
|
|
||||||
|
|
||||||
|
#include "util/logging.h"
|
||||||
#include "re2/regexp.h"
|
#include "re2/regexp.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
@ -86,13 +89,14 @@ template<typename T> class Regexp::Walker {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
// Walk state for the entire traversal.
|
// Walk state for the entire traversal.
|
||||||
stack<WalkState<T> >* stack_;
|
std::stack<WalkState<T> >* stack_;
|
||||||
bool stopped_early_;
|
bool stopped_early_;
|
||||||
int max_visits_;
|
int max_visits_;
|
||||||
|
|
||||||
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
|
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(Walker);
|
Walker(const Walker&) = delete;
|
||||||
|
Walker& operator=(const Walker&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
|
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
|
||||||
@ -130,7 +134,7 @@ template<typename T> struct WalkState {
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<typename T> Regexp::Walker<T>::Walker() {
|
template<typename T> Regexp::Walker<T>::Walker() {
|
||||||
stack_ = new stack<WalkState<T> >;
|
stack_ = new std::stack<WalkState<T> >;
|
||||||
stopped_early_ = false;
|
stopped_early_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -187,7 +191,7 @@ template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
|
|||||||
s->child_args = &s->child_arg;
|
s->child_args = &s->child_arg;
|
||||||
else if (re->nsub_ > 1)
|
else if (re->nsub_ > 1)
|
||||||
s->child_args = new T[re->nsub_];
|
s->child_args = new T[re->nsub_];
|
||||||
// Fall through.
|
FALLTHROUGH_INTENDED;
|
||||||
}
|
}
|
||||||
default: {
|
default: {
|
||||||
if (re->nsub_ > 0) {
|
if (re->nsub_ > 0) {
|
||||||
@ -241,4 +245,4 @@ template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
|
|||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_WALKER_INL_H__
|
#endif // RE2_WALKER_INL_H_
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
file (READ ${SOURCE_FILENAME} CONTENT)
|
file (READ ${SOURCE_FILENAME} CONTENT)
|
||||||
string (REGEX REPLACE "using re2::RE2;" "" CONTENT "${CONTENT}")
|
string (REGEX REPLACE "using re2::RE2;" "" CONTENT "${CONTENT}")
|
||||||
|
string (REGEX REPLACE "using re2::LazyRE2;" "" CONTENT "${CONTENT}")
|
||||||
string (REGEX REPLACE "namespace re2" "namespace re2_st" CONTENT "${CONTENT}")
|
string (REGEX REPLACE "namespace re2" "namespace re2_st" CONTENT "${CONTENT}")
|
||||||
string (REGEX REPLACE "re2::" "re2_st::" CONTENT "${CONTENT}")
|
string (REGEX REPLACE "re2::" "re2_st::" CONTENT "${CONTENT}")
|
||||||
string (REGEX REPLACE "\"re2/" "\"re2_st/" CONTENT "${CONTENT}")
|
string (REGEX REPLACE "\"re2/" "\"re2_st/" CONTENT "${CONTENT}")
|
||||||
|
@ -1,168 +0,0 @@
|
|||||||
// Copyright 2000 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// UnsafeArena::UnsafeArena()
|
|
||||||
// UnsafeArena::~UnsafeArena()
|
|
||||||
// Destroying the arena automatically calls Reset()
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
UnsafeArena::UnsafeArena(const size_t block_size)
|
|
||||||
: block_size_(block_size),
|
|
||||||
freestart_(NULL), // set for real in Reset()
|
|
||||||
last_alloc_(NULL),
|
|
||||||
remaining_(0),
|
|
||||||
blocks_alloced_(1),
|
|
||||||
overflow_blocks_(NULL) {
|
|
||||||
assert(block_size > kDefaultAlignment);
|
|
||||||
|
|
||||||
first_blocks_[0].mem = reinterpret_cast<char*>(malloc(block_size_));
|
|
||||||
first_blocks_[0].size = block_size_;
|
|
||||||
|
|
||||||
Reset();
|
|
||||||
}
|
|
||||||
|
|
||||||
UnsafeArena::~UnsafeArena() {
|
|
||||||
FreeBlocks();
|
|
||||||
assert(overflow_blocks_ == NULL); // FreeBlocks() should do that
|
|
||||||
// The first X blocks stay allocated always by default. Delete them now.
|
|
||||||
for (int i = 0; i < blocks_alloced_; i++)
|
|
||||||
free(first_blocks_[i].mem);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// UnsafeArena::Reset()
|
|
||||||
// Clears all the memory an arena is using.
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
|
|
||||||
void UnsafeArena::Reset() {
|
|
||||||
FreeBlocks();
|
|
||||||
freestart_ = first_blocks_[0].mem;
|
|
||||||
remaining_ = first_blocks_[0].size;
|
|
||||||
last_alloc_ = NULL;
|
|
||||||
|
|
||||||
// We do not know for sure whether or not the first block is aligned,
|
|
||||||
// so we fix that right now.
|
|
||||||
const int overage = reinterpret_cast<uintptr_t>(freestart_) &
|
|
||||||
(kDefaultAlignment-1);
|
|
||||||
if (overage > 0) {
|
|
||||||
const int waste = kDefaultAlignment - overage;
|
|
||||||
freestart_ += waste;
|
|
||||||
remaining_ -= waste;
|
|
||||||
}
|
|
||||||
freestart_when_empty_ = freestart_;
|
|
||||||
assert(!(reinterpret_cast<uintptr_t>(freestart_)&(kDefaultAlignment-1)));
|
|
||||||
}
|
|
||||||
|
|
||||||
// -------------------------------------------------------------
|
|
||||||
// UnsafeArena::AllocNewBlock()
|
|
||||||
// Adds and returns an AllocatedBlock.
|
|
||||||
// The returned AllocatedBlock* is valid until the next call
|
|
||||||
// to AllocNewBlock or Reset. (i.e. anything that might
|
|
||||||
// affect overflow_blocks_).
|
|
||||||
// -------------------------------------------------------------
|
|
||||||
|
|
||||||
UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) {
|
|
||||||
AllocatedBlock *block;
|
|
||||||
// Find the next block.
|
|
||||||
if (static_cast<size_t>(blocks_alloced_) < arraysize(first_blocks_) ) {
|
|
||||||
// Use one of the pre-allocated blocks
|
|
||||||
block = &first_blocks_[blocks_alloced_++];
|
|
||||||
} else { // oops, out of space, move to the vector
|
|
||||||
if (overflow_blocks_ == NULL) overflow_blocks_ = new vector<AllocatedBlock>;
|
|
||||||
// Adds another block to the vector.
|
|
||||||
overflow_blocks_->resize(overflow_blocks_->size()+1);
|
|
||||||
// block points to the last block of the vector.
|
|
||||||
block = &overflow_blocks_->back();
|
|
||||||
}
|
|
||||||
|
|
||||||
block->mem = reinterpret_cast<char*>(malloc(block_size));
|
|
||||||
block->size = block_size;
|
|
||||||
|
|
||||||
return block;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// UnsafeArena::GetMemoryFallback()
|
|
||||||
// We take memory out of our pool, aligned on the byte boundary
|
|
||||||
// requested. If we don't have space in our current pool, we
|
|
||||||
// allocate a new block (wasting the remaining space in the
|
|
||||||
// current block) and give you that. If your memory needs are
|
|
||||||
// too big for a single block, we make a special your-memory-only
|
|
||||||
// allocation -- this is equivalent to not using the arena at all.
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
|
|
||||||
void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) {
|
|
||||||
if (size == 0)
|
|
||||||
return NULL; // stl/stl_alloc.h says this is okay
|
|
||||||
|
|
||||||
assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2
|
|
||||||
|
|
||||||
// If the object is more than a quarter of the block size, allocate
|
|
||||||
// it separately to avoid wasting too much space in leftover bytes
|
|
||||||
if (block_size_ == 0 || size > block_size_/4) {
|
|
||||||
// then it gets its own block in the arena
|
|
||||||
assert(align <= kDefaultAlignment); // because that's what new gives us
|
|
||||||
// This block stays separate from the rest of the world; in particular
|
|
||||||
// we don't update last_alloc_ so you can't reclaim space on this block.
|
|
||||||
return AllocNewBlock(size)->mem;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int overage =
|
|
||||||
(reinterpret_cast<uintptr_t>(freestart_) & (align-1));
|
|
||||||
if (overage) {
|
|
||||||
const int waste = align - overage;
|
|
||||||
freestart_ += waste;
|
|
||||||
if (waste < static_cast<int>(remaining_)) {
|
|
||||||
remaining_ -= waste;
|
|
||||||
} else {
|
|
||||||
remaining_ = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (size > remaining_) {
|
|
||||||
AllocatedBlock *block = AllocNewBlock(block_size_);
|
|
||||||
freestart_ = block->mem;
|
|
||||||
remaining_ = block->size;
|
|
||||||
}
|
|
||||||
remaining_ -= size;
|
|
||||||
last_alloc_ = freestart_;
|
|
||||||
freestart_ += size;
|
|
||||||
assert((reinterpret_cast<uintptr_t>(last_alloc_) & (align-1)) == 0);
|
|
||||||
return reinterpret_cast<void*>(last_alloc_);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
// UnsafeArena::FreeBlocks()
|
|
||||||
// Unlike GetMemory(), which does actual work, ReturnMemory() is a
|
|
||||||
// no-op: we don't "free" memory until Reset() is called. We do
|
|
||||||
// update some stats, though. Note we do no checking that the
|
|
||||||
// pointer you pass in was actually allocated by us, or that it
|
|
||||||
// was allocated for the size you say, so be careful here!
|
|
||||||
// FreeBlocks() does the work for Reset(), actually freeing all
|
|
||||||
// memory allocated in one fell swoop.
|
|
||||||
// ----------------------------------------------------------------------
|
|
||||||
|
|
||||||
void UnsafeArena::FreeBlocks() {
|
|
||||||
for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced
|
|
||||||
free(first_blocks_[i].mem);
|
|
||||||
first_blocks_[i].mem = NULL;
|
|
||||||
first_blocks_[i].size = 0;
|
|
||||||
}
|
|
||||||
blocks_alloced_ = 1;
|
|
||||||
if (overflow_blocks_ != NULL) {
|
|
||||||
vector<AllocatedBlock>::iterator it;
|
|
||||||
for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) {
|
|
||||||
free(it->mem);
|
|
||||||
}
|
|
||||||
delete overflow_blocks_; // These should be used very rarely
|
|
||||||
overflow_blocks_ = NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,103 +0,0 @@
|
|||||||
// Copyright 2000 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Sometimes it is necessary to allocate a large number of small
|
|
||||||
// objects. Doing this the usual way (malloc, new) is slow,
|
|
||||||
// especially for multithreaded programs. An UnsafeArena provides a
|
|
||||||
// mark/release method of memory management: it asks for a large chunk
|
|
||||||
// from the operating system and doles it out bit by bit as required.
|
|
||||||
// Then you free all the memory at once by calling UnsafeArena::Reset().
|
|
||||||
// The "Unsafe" refers to the fact that UnsafeArena is not safe to
|
|
||||||
// call from multiple threads.
|
|
||||||
//
|
|
||||||
// The global operator new that can be used as follows:
|
|
||||||
//
|
|
||||||
// #include "lib/arena-inl.h"
|
|
||||||
//
|
|
||||||
// UnsafeArena arena(1000);
|
|
||||||
// Foo* foo = new (AllocateInArena, &arena) Foo;
|
|
||||||
//
|
|
||||||
|
|
||||||
#ifndef RE2_UTIL_ARENA_H_
|
|
||||||
#define RE2_UTIL_ARENA_H_
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// This class is thread-compatible.
|
|
||||||
class UnsafeArena {
|
|
||||||
public:
|
|
||||||
UnsafeArena(const size_t block_size);
|
|
||||||
virtual ~UnsafeArena();
|
|
||||||
|
|
||||||
void Reset();
|
|
||||||
|
|
||||||
// This should be the worst-case alignment for any type. This is
|
|
||||||
// good for IA-32, SPARC version 7 (the last one I know), and
|
|
||||||
// supposedly Alpha. i386 would be more time-efficient with a
|
|
||||||
// default alignment of 8, but ::operator new() uses alignment of 4,
|
|
||||||
// and an assertion will fail below after the call to MakeNewBlock()
|
|
||||||
// if you try to use a larger alignment.
|
|
||||||
#ifdef __i386__
|
|
||||||
static const int kDefaultAlignment = 4;
|
|
||||||
#else
|
|
||||||
static const int kDefaultAlignment = 8;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
private:
|
|
||||||
void* GetMemoryFallback(const size_t size, const int align);
|
|
||||||
|
|
||||||
public:
|
|
||||||
void* GetMemory(const size_t size, const int align) {
|
|
||||||
if ( size > 0 && size < remaining_ && align == 1 ) { // common case
|
|
||||||
last_alloc_ = freestart_;
|
|
||||||
freestart_ += size;
|
|
||||||
remaining_ -= size;
|
|
||||||
return reinterpret_cast<void*>(last_alloc_);
|
|
||||||
}
|
|
||||||
return GetMemoryFallback(size, align);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
struct AllocatedBlock {
|
|
||||||
char *mem;
|
|
||||||
size_t size;
|
|
||||||
};
|
|
||||||
|
|
||||||
// The returned AllocatedBlock* is valid until the next call to AllocNewBlock
|
|
||||||
// or Reset (i.e. anything that might affect overflow_blocks_).
|
|
||||||
AllocatedBlock *AllocNewBlock(const size_t block_size);
|
|
||||||
|
|
||||||
const AllocatedBlock *IndexToBlock(int index) const;
|
|
||||||
|
|
||||||
const size_t block_size_;
|
|
||||||
char* freestart_; // beginning of the free space in most recent block
|
|
||||||
char* freestart_when_empty_; // beginning of the free space when we're empty
|
|
||||||
char* last_alloc_; // used to make sure ReturnBytes() is safe
|
|
||||||
size_t remaining_;
|
|
||||||
// STL vector isn't as efficient as it could be, so we use an array at first
|
|
||||||
int blocks_alloced_; // how many of the first_blocks_ have been alloced
|
|
||||||
AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary
|
|
||||||
// if the first_blocks_ aren't enough, expand into overflow_blocks_.
|
|
||||||
vector<AllocatedBlock>* overflow_blocks_;
|
|
||||||
|
|
||||||
void FreeBlocks(); // Frees all except first block
|
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena);
|
|
||||||
};
|
|
||||||
|
|
||||||
// Operators for allocation on the arena
|
|
||||||
// Syntax: new (AllocateInArena, arena) MyClass;
|
|
||||||
// STL containers, etc.
|
|
||||||
enum AllocateInArenaType { AllocateInArena };
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
inline void* operator new(size_t size,
|
|
||||||
re2::AllocateInArenaType /* unused */,
|
|
||||||
re2::UnsafeArena *arena) {
|
|
||||||
return reinterpret_cast<char*>(arena->GetMemory(size, 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // RE2_UTIL_ARENA_H_
|
|
||||||
|
|
@ -1,137 +0,0 @@
|
|||||||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_UTIL_ATOMICOPS_H__
|
|
||||||
#define RE2_UTIL_ATOMICOPS_H__
|
|
||||||
|
|
||||||
// The memory ordering constraints resemble the ones in C11.
|
|
||||||
// RELAXED - no memory ordering, just an atomic operation.
|
|
||||||
// CONSUME - data-dependent ordering.
|
|
||||||
// ACQUIRE - prevents memory accesses from hoisting above the operation.
|
|
||||||
// RELEASE - prevents memory accesses from sinking below the operation.
|
|
||||||
|
|
||||||
#if (__clang_major__ * 100 + __clang_minor__ >= 303) || \
|
|
||||||
(__GNUC__ * 1000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ >= 40801)
|
|
||||||
|
|
||||||
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0)
|
|
||||||
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0)
|
|
||||||
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0)
|
|
||||||
#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED)
|
|
||||||
#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE)
|
|
||||||
|
|
||||||
#else // old compiler
|
|
||||||
|
|
||||||
#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0)
|
|
||||||
#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0)
|
|
||||||
#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0)
|
|
||||||
#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0)
|
|
||||||
#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0)
|
|
||||||
|
|
||||||
// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier()
|
|
||||||
// are an implementation detail and must not be used in the rest of the code.
|
|
||||||
|
|
||||||
#if defined(__i386__)
|
|
||||||
|
|
||||||
static inline void WriteMemoryBarrier() {
|
|
||||||
int x;
|
|
||||||
__asm__ __volatile__("xchgl (%0),%0" // The lock prefix is implicit for xchg.
|
|
||||||
:: "r" (&x));
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(__x86_64__)
|
|
||||||
|
|
||||||
// 64-bit implementations of memory barrier can be simpler, because
|
|
||||||
// "sfence" is guaranteed to exist.
|
|
||||||
static inline void WriteMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("sfence" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(__ppc__)
|
|
||||||
|
|
||||||
static inline void WriteMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("eieio" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(__alpha__)
|
|
||||||
|
|
||||||
static inline void WriteMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("wmb" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(__aarch64__)
|
|
||||||
|
|
||||||
static inline void WriteMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("dmb st" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#include "util/mutex.h"
|
|
||||||
|
|
||||||
static inline void WriteMemoryBarrier() {
|
|
||||||
// Slight overkill, but good enough:
|
|
||||||
// any mutex implementation must have
|
|
||||||
// a read barrier after the lock operation and
|
|
||||||
// a write barrier before the unlock operation.
|
|
||||||
//
|
|
||||||
// It may be worthwhile to write architecture-specific
|
|
||||||
// barriers for the common platforms, as above, but
|
|
||||||
// this is a correct fallback.
|
|
||||||
re2::Mutex mu;
|
|
||||||
re2::MutexLock l(&mu);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
#error Need WriteMemoryBarrier for architecture.
|
|
||||||
|
|
||||||
// Windows
|
|
||||||
inline void WriteMemoryBarrier() {
|
|
||||||
LONG x;
|
|
||||||
::InterlockedExchange(&x, 0);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Alpha has very weak memory ordering. If relying on WriteBarriers, one must
|
|
||||||
// use read barriers for the readers too.
|
|
||||||
#if defined(__alpha__)
|
|
||||||
|
|
||||||
static inline void MaybeReadMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("mb" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
static inline void MaybeReadMemoryBarrier() {}
|
|
||||||
|
|
||||||
#endif // __alpha__
|
|
||||||
|
|
||||||
// Read barrier for various targets.
|
|
||||||
|
|
||||||
#if defined(__aarch64__)
|
|
||||||
|
|
||||||
static inline void ReadMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("dmb ld" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#elif defined(__alpha__)
|
|
||||||
|
|
||||||
static inline void ReadMemoryBarrier() {
|
|
||||||
__asm__ __volatile__("mb" : : : "memory");
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
static inline void ReadMemoryBarrier() {}
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // old compiler
|
|
||||||
|
|
||||||
#ifndef NO_THREAD_SAFETY_ANALYSIS
|
|
||||||
#define NO_THREAD_SAFETY_ANALYSIS
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#endif // RE2_UTIL_ATOMICOPS_H__
|
|
@ -2,6 +2,12 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <chrono>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
#include "util/flags.h"
|
#include "util/flags.h"
|
||||||
#include "util/benchmark.h"
|
#include "util/benchmark.h"
|
||||||
@ -9,8 +15,11 @@
|
|||||||
|
|
||||||
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define snprintf _snprintf
|
||||||
|
#endif
|
||||||
|
|
||||||
using testing::Benchmark;
|
using testing::Benchmark;
|
||||||
using namespace re2;
|
|
||||||
|
|
||||||
static Benchmark* benchmarks[10000];
|
static Benchmark* benchmarks[10000];
|
||||||
static int nbenchmarks;
|
static int nbenchmarks;
|
||||||
@ -24,19 +33,17 @@ void Benchmark::Register() {
|
|||||||
nbenchmarks++;
|
nbenchmarks++;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int64 nsec() {
|
static int64_t nsec() {
|
||||||
struct timeval tv;
|
return std::chrono::duration_cast<std::chrono::nanoseconds>(
|
||||||
if(gettimeofday(&tv, 0) < 0)
|
std::chrono::steady_clock::now().time_since_epoch()).count();
|
||||||
return -1;
|
|
||||||
return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int64 bytes;
|
static int64_t bytes;
|
||||||
static int64 ns;
|
static int64_t ns;
|
||||||
static int64 t0;
|
static int64_t t0;
|
||||||
static int64 items;
|
static int64_t items;
|
||||||
|
|
||||||
void SetBenchmarkBytesProcessed(long long x) {
|
void SetBenchmarkBytesProcessed(int64_t x) {
|
||||||
bytes = x;
|
bytes = x;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -74,7 +81,7 @@ static void runN(Benchmark *b, int n, int siz) {
|
|||||||
b->fnr(n, siz);
|
b->fnr(n, siz);
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "%s: missing function\n", b->name);
|
fprintf(stderr, "%s: missing function\n", b->name);
|
||||||
exit(2);
|
abort();
|
||||||
}
|
}
|
||||||
if(t0 != 0)
|
if(t0 != 0)
|
||||||
ns += nsec() - t0;
|
ns += nsec() - t0;
|
||||||
@ -105,11 +112,11 @@ void RunBench(Benchmark* b, int nthread, int siz) {
|
|||||||
while(ns < (int)1e9 && n < (int)1e9) {
|
while(ns < (int)1e9 && n < (int)1e9) {
|
||||||
last = n;
|
last = n;
|
||||||
if(ns/n == 0)
|
if(ns/n == 0)
|
||||||
n = 1e9;
|
n = (int)1e9;
|
||||||
else
|
else
|
||||||
n = 1e9 / (ns/n);
|
n = (int)1e9 / static_cast<int>(ns/n);
|
||||||
|
|
||||||
n = max(last+1, min(n+n/2, 100*last));
|
n = std::max(last+1, std::min(n+n/2, 100*last));
|
||||||
n = round(n);
|
n = round(n);
|
||||||
runN(b, n, siz);
|
runN(b, n, siz);
|
||||||
}
|
}
|
||||||
@ -146,7 +153,7 @@ int main(int argc, const char** argv) {
|
|||||||
Benchmark* b = benchmarks[i];
|
Benchmark* b = benchmarks[i];
|
||||||
if(match(b->name, argc, argv))
|
if(match(b->name, argc, argv))
|
||||||
for(int j = b->threadlo; j <= b->threadhi; j++)
|
for(int j = b->threadlo; j <= b->threadhi; j++)
|
||||||
for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1)
|
for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1)
|
||||||
RunBench(b, j, k);
|
RunBench(b, j, k);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,8 +2,10 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#ifndef RE2_UTIL_BENCHMARK_H__
|
#ifndef UTIL_BENCHMARK_H_
|
||||||
#define RE2_UTIL_BENCHMARK_H__
|
#define UTIL_BENCHMARK_H_
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
namespace testing {
|
namespace testing {
|
||||||
struct Benchmark {
|
struct Benchmark {
|
||||||
@ -23,7 +25,7 @@ struct Benchmark {
|
|||||||
};
|
};
|
||||||
} // namespace testing
|
} // namespace testing
|
||||||
|
|
||||||
void SetBenchmarkBytesProcessed(long long);
|
void SetBenchmarkBytesProcessed(int64_t);
|
||||||
void StopBenchmarkTiming();
|
void StopBenchmarkTiming();
|
||||||
void StartBenchmarkTiming();
|
void StartBenchmarkTiming();
|
||||||
void BenchmarkMemoryUsage();
|
void BenchmarkMemoryUsage();
|
||||||
@ -38,4 +40,4 @@ int NumCPUs();
|
|||||||
::testing::Benchmark* _benchmark_##f = \
|
::testing::Benchmark* _benchmark_##f = \
|
||||||
(new ::testing::Benchmark(#f, f, lo, hi))
|
(new ::testing::Benchmark(#f, f, lo, hi))
|
||||||
|
|
||||||
#endif // RE2_UTIL_BENCHMARK_H__
|
#endif // UTIL_BENCHMARK_H_
|
||||||
|
@ -2,13 +2,15 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_FLAGS_H_
|
||||||
|
#define UTIL_FLAGS_H_
|
||||||
|
|
||||||
// Simplified version of Google's command line flags.
|
// Simplified version of Google's command line flags.
|
||||||
// Does not support parsing the command line.
|
// Does not support parsing the command line.
|
||||||
// If you want to do that, see
|
// If you want to do that, see
|
||||||
// http://code.google.com/p/google-gflags
|
// https://gflags.github.io/gflags/
|
||||||
|
|
||||||
#ifndef RE2_UTIL_FLAGS_H__
|
#include <stdint.h>
|
||||||
#define RE2_UTIL_FLAGS_H__
|
|
||||||
|
|
||||||
#define DEFINE_flag(type, name, deflt, desc) \
|
#define DEFINE_flag(type, name, deflt, desc) \
|
||||||
namespace re2 { type FLAGS_##name = deflt; }
|
namespace re2 { type FLAGS_##name = deflt; }
|
||||||
@ -17,11 +19,11 @@
|
|||||||
namespace re2 { extern type FLAGS_##name; }
|
namespace re2 { extern type FLAGS_##name; }
|
||||||
|
|
||||||
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
|
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
|
||||||
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc)
|
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc)
|
||||||
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
|
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
|
||||||
|
|
||||||
#define DECLARE_bool(name) DECLARE_flag(bool, name)
|
#define DECLARE_bool(name) DECLARE_flag(bool, name)
|
||||||
#define DECLARE_int32(name) DECLARE_flag(int32, name)
|
#define DECLARE_int32(name) DECLARE_flag(int32_t, name)
|
||||||
#define DECLARE_string(name) DECLARE_flag(string, name)
|
#define DECLARE_string(name) DECLARE_flag(string, name)
|
||||||
|
|
||||||
#endif // RE2_UTIL_FLAGS_H__
|
#endif // UTIL_FLAGS_H_
|
||||||
|
21
contrib/libre2/util/fuzz.cc
Normal file
21
contrib/libre2/util/fuzz.cc
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// Entry point for libFuzzer.
|
||||||
|
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
uint8_t data[32];
|
||||||
|
for (int i = 0; i < 32; i++) {
|
||||||
|
for (int j = 0; j < 32; j++) {
|
||||||
|
data[j] = random() & 0xFF;
|
||||||
|
}
|
||||||
|
LLVMFuzzerTestOneInput(data, 32);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
@ -1,231 +0,0 @@
|
|||||||
// Modified by Russ Cox to add "namespace re2".
|
|
||||||
// Also threw away all but hashword and hashword2.
|
|
||||||
// http://burtleburtle.net/bob/c/lookup3.c
|
|
||||||
|
|
||||||
/*
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
lookup3.c, by Bob Jenkins, May 2006, Public Domain.
|
|
||||||
|
|
||||||
These are functions for producing 32-bit hashes for hash table lookup.
|
|
||||||
hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
|
|
||||||
are externally useful functions. Routines to test the hash are included
|
|
||||||
if SELF_TEST is defined. You can use this free for any purpose. It's in
|
|
||||||
the public domain. It has no warranty.
|
|
||||||
|
|
||||||
You probably want to use hashlittle(). hashlittle() and hashbig()
|
|
||||||
hash byte arrays. hashlittle() is is faster than hashbig() on
|
|
||||||
little-endian machines. Intel and AMD are little-endian machines.
|
|
||||||
On second thought, you probably want hashlittle2(), which is identical to
|
|
||||||
hashlittle() except it returns two 32-bit hashes for the price of one.
|
|
||||||
You could implement hashbig2() if you wanted but I haven't bothered here.
|
|
||||||
|
|
||||||
If you want to find a hash of, say, exactly 7 integers, do
|
|
||||||
a = i1; b = i2; c = i3;
|
|
||||||
mix(a,b,c);
|
|
||||||
a += i4; b += i5; c += i6;
|
|
||||||
mix(a,b,c);
|
|
||||||
a += i7;
|
|
||||||
final(a,b,c);
|
|
||||||
then use c as the hash value. If you have a variable length array of
|
|
||||||
4-byte integers to hash, use hashword(). If you have a byte array (like
|
|
||||||
a character string), use hashlittle(). If you have several byte arrays, or
|
|
||||||
a mix of things, see the comments above hashlittle().
|
|
||||||
|
|
||||||
Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
|
|
||||||
then mix those integers. This is fast (you can do a lot more thorough
|
|
||||||
mixing with 12*3 instructions on 3 integers than you can with 3 instructions
|
|
||||||
on 1 byte), but shoehorning those bytes into integers efficiently is messy.
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
|
|
||||||
|
|
||||||
/*
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
mix -- mix 3 32-bit values reversibly.
|
|
||||||
|
|
||||||
This is reversible, so any information in (a,b,c) before mix() is
|
|
||||||
still in (a,b,c) after mix().
|
|
||||||
|
|
||||||
If four pairs of (a,b,c) inputs are run through mix(), or through
|
|
||||||
mix() in reverse, there are at least 32 bits of the output that
|
|
||||||
are sometimes the same for one pair and different for another pair.
|
|
||||||
This was tested for:
|
|
||||||
* pairs that differed by one bit, by two bits, in any combination
|
|
||||||
of top bits of (a,b,c), or in any combination of bottom bits of
|
|
||||||
(a,b,c).
|
|
||||||
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
|
||||||
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
|
||||||
is commonly produced by subtraction) look like a single 1-bit
|
|
||||||
difference.
|
|
||||||
* the base values were pseudorandom, all zero but one bit set, or
|
|
||||||
all zero plus a counter that starts at zero.
|
|
||||||
|
|
||||||
Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
|
|
||||||
satisfy this are
|
|
||||||
4 6 8 16 19 4
|
|
||||||
9 15 3 18 27 15
|
|
||||||
14 9 3 7 17 3
|
|
||||||
Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
|
|
||||||
for "differ" defined as + with a one-bit base and a two-bit delta. I
|
|
||||||
used http://burtleburtle.net/bob/hash/avalanche.html to choose
|
|
||||||
the operations, constants, and arrangements of the variables.
|
|
||||||
|
|
||||||
This does not achieve avalanche. There are input bits of (a,b,c)
|
|
||||||
that fail to affect some output bits of (a,b,c), especially of a. The
|
|
||||||
most thoroughly mixed value is c, but it doesn't really even achieve
|
|
||||||
avalanche in c.
|
|
||||||
|
|
||||||
This allows some parallelism. Read-after-writes are good at doubling
|
|
||||||
the number of bits affected, so the goal of mixing pulls in the opposite
|
|
||||||
direction as the goal of parallelism. I did what I could. Rotates
|
|
||||||
seem to cost as much as shifts on every machine I could lay my hands
|
|
||||||
on, and rotates are much kinder to the top and bottom bits, so I used
|
|
||||||
rotates.
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
*/
|
|
||||||
#define mix(a,b,c) \
|
|
||||||
{ \
|
|
||||||
a -= c; a ^= rot(c, 4); c += b; \
|
|
||||||
b -= a; b ^= rot(a, 6); a += c; \
|
|
||||||
c -= b; c ^= rot(b, 8); b += a; \
|
|
||||||
a -= c; a ^= rot(c,16); c += b; \
|
|
||||||
b -= a; b ^= rot(a,19); a += c; \
|
|
||||||
c -= b; c ^= rot(b, 4); b += a; \
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
final -- final mixing of 3 32-bit values (a,b,c) into c
|
|
||||||
|
|
||||||
Pairs of (a,b,c) values differing in only a few bits will usually
|
|
||||||
produce values of c that look totally different. This was tested for
|
|
||||||
* pairs that differed by one bit, by two bits, in any combination
|
|
||||||
of top bits of (a,b,c), or in any combination of bottom bits of
|
|
||||||
(a,b,c).
|
|
||||||
* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
|
|
||||||
the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
|
|
||||||
is commonly produced by subtraction) look like a single 1-bit
|
|
||||||
difference.
|
|
||||||
* the base values were pseudorandom, all zero but one bit set, or
|
|
||||||
all zero plus a counter that starts at zero.
|
|
||||||
|
|
||||||
These constants passed:
|
|
||||||
14 11 25 16 4 14 24
|
|
||||||
12 14 25 16 4 14 24
|
|
||||||
and these came close:
|
|
||||||
4 8 15 26 3 22 24
|
|
||||||
10 8 15 26 3 22 24
|
|
||||||
11 8 15 26 3 22 24
|
|
||||||
-------------------------------------------------------------------------------
|
|
||||||
*/
|
|
||||||
#define final(a,b,c) \
|
|
||||||
{ \
|
|
||||||
c ^= b; c -= rot(b,14); \
|
|
||||||
a ^= c; a -= rot(c,11); \
|
|
||||||
b ^= a; b -= rot(a,25); \
|
|
||||||
c ^= b; c -= rot(b,16); \
|
|
||||||
a ^= c; a -= rot(c,4); \
|
|
||||||
b ^= a; b -= rot(a,14); \
|
|
||||||
c ^= b; c -= rot(b,24); \
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
/*
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
This works on all machines. To be useful, it requires
|
|
||||||
-- that the key be an array of uint32_t's, and
|
|
||||||
-- that the length be the number of uint32_t's in the key
|
|
||||||
|
|
||||||
The function hashword() is identical to hashlittle() on little-endian
|
|
||||||
machines, and identical to hashbig() on big-endian machines,
|
|
||||||
except that the length has to be measured in uint32_ts rather than in
|
|
||||||
bytes. hashlittle() is more complicated than hashword() only because
|
|
||||||
hashlittle() has to dance around fitting the key bytes into registers.
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
*/
|
|
||||||
uint32 hashword(
|
|
||||||
const uint32 *k, /* the key, an array of uint32_t values */
|
|
||||||
size_t length, /* the length of the key, in uint32_ts */
|
|
||||||
uint32 initval) /* the previous hash, or an arbitrary value */
|
|
||||||
{
|
|
||||||
uint32_t a,b,c;
|
|
||||||
|
|
||||||
/* Set up the internal state */
|
|
||||||
a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
|
|
||||||
|
|
||||||
/*------------------------------------------------- handle most of the key */
|
|
||||||
while (length > 3)
|
|
||||||
{
|
|
||||||
a += k[0];
|
|
||||||
b += k[1];
|
|
||||||
c += k[2];
|
|
||||||
mix(a,b,c);
|
|
||||||
length -= 3;
|
|
||||||
k += 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*------------------------------------------- handle the last 3 uint32_t's */
|
|
||||||
switch(length) /* all the case statements fall through */
|
|
||||||
{
|
|
||||||
case 3 : c+=k[2];
|
|
||||||
case 2 : b+=k[1];
|
|
||||||
case 1 : a+=k[0];
|
|
||||||
final(a,b,c);
|
|
||||||
case 0: /* case 0: nothing left to add */
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/*------------------------------------------------------ report the result */
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
hashword2() -- same as hashword(), but take two seeds and return two
|
|
||||||
32-bit values. pc and pb must both be nonnull, and *pc and *pb must
|
|
||||||
both be initialized with seeds. If you pass in (*pb)==0, the output
|
|
||||||
(*pc) will be the same as the return value from hashword().
|
|
||||||
--------------------------------------------------------------------
|
|
||||||
*/
|
|
||||||
void hashword2 (
|
|
||||||
const uint32 *k, /* the key, an array of uint32_t values */
|
|
||||||
size_t length, /* the length of the key, in uint32_ts */
|
|
||||||
uint32 *pc, /* IN: seed OUT: primary hash value */
|
|
||||||
uint32 *pb) /* IN: more seed OUT: secondary hash value */
|
|
||||||
{
|
|
||||||
uint32_t a,b,c;
|
|
||||||
|
|
||||||
/* Set up the internal state */
|
|
||||||
a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc;
|
|
||||||
c += *pb;
|
|
||||||
|
|
||||||
/*------------------------------------------------- handle most of the key */
|
|
||||||
while (length > 3)
|
|
||||||
{
|
|
||||||
a += k[0];
|
|
||||||
b += k[1];
|
|
||||||
c += k[2];
|
|
||||||
mix(a,b,c);
|
|
||||||
length -= 3;
|
|
||||||
k += 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*------------------------------------------- handle the last 3 uint32_t's */
|
|
||||||
switch(length) /* all the case statements fall through */
|
|
||||||
{
|
|
||||||
case 3 : c+=k[2];
|
|
||||||
case 2 : b+=k[1];
|
|
||||||
case 1 : a+=k[0];
|
|
||||||
final(a,b,c);
|
|
||||||
case 0: /* case 0: nothing left to add */
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
/*------------------------------------------------------ report the result */
|
|
||||||
*pc=c; *pb=b;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -2,14 +2,19 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_LOGGING_H_
|
||||||
|
#define UTIL_LOGGING_H_
|
||||||
|
|
||||||
// Simplified version of Google's logging.
|
// Simplified version of Google's logging.
|
||||||
|
|
||||||
#ifndef RE2_UTIL_LOGGING_H__
|
#include <assert.h>
|
||||||
#define RE2_UTIL_LOGGING_H__
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
#include <unistd.h> /* for write */
|
#include <ostream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
|
#include "util/util.h"
|
||||||
|
|
||||||
// Debug-only checking.
|
// Debug-only checking.
|
||||||
#define DCHECK(condition) assert(condition)
|
#define DCHECK(condition) assert(condition)
|
||||||
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
|
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
|
||||||
@ -29,33 +34,37 @@
|
|||||||
#define CHECK_NE(x, y) CHECK((x) != (y))
|
#define CHECK_NE(x, y) CHECK((x) != (y))
|
||||||
|
|
||||||
#define LOG_INFO LogMessage(__FILE__, __LINE__)
|
#define LOG_INFO LogMessage(__FILE__, __LINE__)
|
||||||
#define LOG_ERROR LOG_INFO
|
#define LOG_WARNING LogMessage(__FILE__, __LINE__)
|
||||||
#define LOG_WARNING LOG_INFO
|
#define LOG_ERROR LogMessage(__FILE__, __LINE__)
|
||||||
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
|
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
|
||||||
#define LOG_QFATAL LOG_FATAL
|
#define LOG_QFATAL LOG_FATAL
|
||||||
|
|
||||||
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
|
// It seems that one of the Windows header files defines ERROR as 0.
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define LOG_0 LOG_INFO
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NDEBUG
|
#ifdef NDEBUG
|
||||||
#define DEBUG_MODE 0
|
|
||||||
#define LOG_DFATAL LOG_ERROR
|
#define LOG_DFATAL LOG_ERROR
|
||||||
#else
|
#else
|
||||||
#define DEBUG_MODE 1
|
|
||||||
#define LOG_DFATAL LOG_FATAL
|
#define LOG_DFATAL LOG_FATAL
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LOG(severity) LOG_ ## severity.stream()
|
#define LOG(severity) LOG_ ## severity.stream()
|
||||||
|
|
||||||
|
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
|
||||||
|
|
||||||
class LogMessage {
|
class LogMessage {
|
||||||
public:
|
public:
|
||||||
LogMessage(const char* file, int line) : flushed_(false) {
|
LogMessage(const char* file, int line)
|
||||||
|
: flushed_(false) {
|
||||||
stream() << file << ":" << line << ": ";
|
stream() << file << ":" << line << ": ";
|
||||||
}
|
}
|
||||||
void Flush() {
|
void Flush() {
|
||||||
stream() << "\n";
|
stream() << "\n";
|
||||||
string s = str_.str();
|
string s = str_.str();
|
||||||
int n = (int)s.size(); // shut up msvc
|
size_t n = s.size();
|
||||||
if(write(2, s.data(), n) < 0) {} // shut up gcc
|
if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc
|
||||||
flushed_ = true;
|
flushed_ = true;
|
||||||
}
|
}
|
||||||
~LogMessage() {
|
~LogMessage() {
|
||||||
@ -63,14 +72,23 @@ class LogMessage {
|
|||||||
Flush();
|
Flush();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ostream& stream() { return str_; }
|
std::ostream& stream() { return str_; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
bool flushed_;
|
bool flushed_;
|
||||||
std::ostringstream str_;
|
std::ostringstream str_;
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
|
|
||||||
|
LogMessage(const LogMessage&) = delete;
|
||||||
|
LogMessage& operator=(const LogMessage&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Silence "destructor never returns" warning for ~LogMessageFatal().
|
||||||
|
// Since this is a header file, push and then pop to limit the scope.
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#pragma warning(push)
|
||||||
|
#pragma warning(disable: 4722)
|
||||||
|
#endif
|
||||||
|
|
||||||
class LogMessageFatal : public LogMessage {
|
class LogMessageFatal : public LogMessage {
|
||||||
public:
|
public:
|
||||||
LogMessageFatal(const char* file, int line)
|
LogMessageFatal(const char* file, int line)
|
||||||
@ -80,7 +98,12 @@ class LogMessageFatal : public LogMessage {
|
|||||||
abort();
|
abort();
|
||||||
}
|
}
|
||||||
private:
|
private:
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
|
LogMessageFatal(const LogMessageFatal&) = delete;
|
||||||
|
LogMessageFatal& operator=(const LogMessageFatal&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // RE2_UTIL_LOGGING_H__
|
#ifdef _MSC_VER
|
||||||
|
#pragma warning(pop)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // UTIL_LOGGING_H_
|
||||||
|
41
contrib/libre2/util/mix.h
Normal file
41
contrib/libre2/util/mix.h
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_MIX_H_
|
||||||
|
#define UTIL_MIX_H_
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <limits>
|
||||||
|
|
||||||
|
namespace re2 {
|
||||||
|
|
||||||
|
// Silence "truncation of constant value" warning for kMul in 32-bit mode.
|
||||||
|
// Since this is a header file, push and then pop to limit the scope.
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#pragma warning(push)
|
||||||
|
#pragma warning(disable: 4309)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
class HashMix {
|
||||||
|
public:
|
||||||
|
HashMix() : hash_(1) {}
|
||||||
|
explicit HashMix(size_t val) : hash_(val + 83) {}
|
||||||
|
void Mix(size_t val) {
|
||||||
|
static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
|
||||||
|
hash_ *= kMul;
|
||||||
|
hash_ = ((hash_ << 19) |
|
||||||
|
(hash_ >> (std::numeric_limits<size_t>::digits - 19))) + val;
|
||||||
|
}
|
||||||
|
size_t get() const { return hash_; }
|
||||||
|
private:
|
||||||
|
size_t hash_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef _MSC_VER
|
||||||
|
#pragma warning(pop)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
} // namespace re2
|
||||||
|
|
||||||
|
#endif // UTIL_MIX_H_
|
@ -2,64 +2,41 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_MUTEX_H_
|
||||||
|
#define UTIL_MUTEX_H_
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* A simple mutex wrapper, supporting locks and read-write locks.
|
* A simple mutex wrapper, supporting locks and read-write locks.
|
||||||
* You should assume the locks are *not* re-entrant.
|
* You should assume the locks are *not* re-entrant.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef RE2_UTIL_MUTEX_H_
|
#if !defined(_WIN32)
|
||||||
#define RE2_UTIL_MUTEX_H_
|
#ifndef _POSIX_C_SOURCE
|
||||||
|
#define _POSIX_C_SOURCE 200809L
|
||||||
|
#endif
|
||||||
|
#include <unistd.h>
|
||||||
|
#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
|
||||||
|
#define MUTEX_IS_PTHREAD_RWLOCK
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
|
||||||
|
#include <pthread.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
typedef pthread_rwlock_t MutexType;
|
||||||
|
#else
|
||||||
|
#include <mutex>
|
||||||
|
typedef std::mutex MutexType;
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
#define HAVE_PTHREAD 1
|
|
||||||
#define HAVE_RWLOCK 1
|
|
||||||
|
|
||||||
#if defined(NO_THREADS)
|
|
||||||
typedef int MutexType; // to keep a lock-count
|
|
||||||
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
|
|
||||||
// Needed for pthread_rwlock_*. If it causes problems, you could take it
|
|
||||||
// out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it
|
|
||||||
// *does* cause problems for FreeBSD, or MacOSX, but isn't needed
|
|
||||||
// for locking there.)
|
|
||||||
# ifdef __linux__
|
|
||||||
# undef _XOPEN_SOURCE
|
|
||||||
# define _XOPEN_SOURCE 500 // may be needed to get the rwlock calls
|
|
||||||
# endif
|
|
||||||
# include <pthread.h>
|
|
||||||
typedef pthread_rwlock_t MutexType;
|
|
||||||
#elif defined(HAVE_PTHREAD)
|
|
||||||
# include <pthread.h>
|
|
||||||
typedef pthread_mutex_t MutexType;
|
|
||||||
#elif defined(WIN32)
|
|
||||||
# define WIN32_LEAN_AND_MEAN // We only need minimal includes
|
|
||||||
# ifdef GMUTEX_TRYLOCK
|
|
||||||
// We need Windows NT or later for TryEnterCriticalSection(). If you
|
|
||||||
// don't need that functionality, you can remove these _WIN32_WINNT
|
|
||||||
// lines, and change TryLock() to assert(0) or something.
|
|
||||||
# ifndef _WIN32_WINNT
|
|
||||||
# define _WIN32_WINNT 0x0400
|
|
||||||
# endif
|
|
||||||
# endif
|
|
||||||
# include <windows.h>
|
|
||||||
typedef CRITICAL_SECTION MutexType;
|
|
||||||
#else
|
|
||||||
# error Need to implement mutex.h for your architecture, or #define NO_THREADS
|
|
||||||
#endif
|
|
||||||
|
|
||||||
class Mutex {
|
class Mutex {
|
||||||
public:
|
public:
|
||||||
// Create a Mutex that is not held by anybody.
|
|
||||||
inline Mutex();
|
inline Mutex();
|
||||||
|
|
||||||
// Destructor
|
|
||||||
inline ~Mutex();
|
inline ~Mutex();
|
||||||
|
|
||||||
inline void Lock(); // Block if needed until free then acquire exclusively
|
inline void Lock(); // Block if needed until free then acquire exclusively
|
||||||
inline void Unlock(); // Release a lock acquired via Lock()
|
inline void Unlock(); // Release a lock acquired via Lock()
|
||||||
inline bool TryLock(); // If free, Lock() and return true, else return false
|
|
||||||
// Note that on systems that don't support read-write locks, these may
|
// Note that on systems that don't support read-write locks, these may
|
||||||
// be implemented as synonyms to Lock() and Unlock(). So you can use
|
// be implemented as synonyms to Lock() and Unlock(). So you can use
|
||||||
// these for efficiency, but don't use them anyplace where being able
|
// these for efficiency, but don't use them anyplace where being able
|
||||||
@ -68,80 +45,44 @@ class Mutex {
|
|||||||
inline void ReaderUnlock(); // Release a read share of this Mutex
|
inline void ReaderUnlock(); // Release a read share of this Mutex
|
||||||
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
|
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
|
||||||
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
|
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
|
||||||
inline void AssertHeld() { }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
MutexType mutex_;
|
MutexType mutex_;
|
||||||
|
|
||||||
// Catch the error of writing Mutex when intending MutexLock.
|
// Catch the error of writing Mutex when intending MutexLock.
|
||||||
Mutex(Mutex *ignored);
|
Mutex(Mutex *ignored);
|
||||||
// Disallow "evil" constructors
|
|
||||||
Mutex(const Mutex&);
|
Mutex(const Mutex&) = delete;
|
||||||
void operator=(const Mutex&);
|
Mutex& operator=(const Mutex&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Now the implementation of Mutex for various systems
|
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
|
||||||
#if defined(NO_THREADS)
|
|
||||||
|
|
||||||
// When we don't have threads, we can be either reading or writing,
|
#define SAFE_PTHREAD(fncall) \
|
||||||
// but not both. We can have lots of readers at once (in no-threads
|
do { \
|
||||||
// mode, that's most likely to happen in recursive function calls),
|
if ((fncall) != 0) abort(); \
|
||||||
// but only one writer. We represent this by having mutex_ be -1 when
|
} while (0)
|
||||||
// writing and a number > 0 when reading (and 0 when no lock is held).
|
|
||||||
//
|
|
||||||
// In debug mode, we assert these invariants, while in non-debug mode
|
|
||||||
// we do nothing, for efficiency. That's why everything is in an
|
|
||||||
// assert.
|
|
||||||
#include <assert.h>
|
|
||||||
|
|
||||||
Mutex::Mutex() : mutex_(0) { }
|
|
||||||
Mutex::~Mutex() { assert(mutex_ == 0); }
|
|
||||||
void Mutex::Lock() { assert(--mutex_ == -1); }
|
|
||||||
void Mutex::Unlock() { assert(mutex_++ == -1); }
|
|
||||||
bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; }
|
|
||||||
void Mutex::ReaderLock() { assert(++mutex_ > 0); }
|
|
||||||
void Mutex::ReaderUnlock() { assert(mutex_-- > 0); }
|
|
||||||
|
|
||||||
#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK)
|
|
||||||
|
|
||||||
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
|
|
||||||
|
|
||||||
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
|
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
|
||||||
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
|
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
|
||||||
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
|
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
|
||||||
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
||||||
bool Mutex::TryLock() { return pthread_rwlock_trywrlock(&mutex_) == 0; }
|
|
||||||
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
|
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
|
||||||
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
|
||||||
|
|
||||||
#undef SAFE_PTHREAD
|
#undef SAFE_PTHREAD
|
||||||
|
|
||||||
#elif defined(HAVE_PTHREAD)
|
#else
|
||||||
|
|
||||||
#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0)
|
Mutex::Mutex() { }
|
||||||
|
Mutex::~Mutex() { }
|
||||||
Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); }
|
void Mutex::Lock() { mutex_.lock(); }
|
||||||
Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy(&mutex_)); }
|
void Mutex::Unlock() { mutex_.unlock(); }
|
||||||
void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock(&mutex_)); }
|
void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex.
|
||||||
void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock(&mutex_)); }
|
|
||||||
bool Mutex::TryLock() { return pthread_mutex_trylock(&mutex_) == 0; }
|
|
||||||
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
|
|
||||||
void Mutex::ReaderUnlock() { Unlock(); }
|
|
||||||
#undef SAFE_PTHREAD
|
|
||||||
|
|
||||||
#elif defined(WIN32)
|
|
||||||
|
|
||||||
Mutex::Mutex() { InitializeCriticalSection(&mutex_); }
|
|
||||||
Mutex::~Mutex() { DeleteCriticalSection(&mutex_); }
|
|
||||||
void Mutex::Lock() { EnterCriticalSection(&mutex_); }
|
|
||||||
void Mutex::Unlock() { LeaveCriticalSection(&mutex_); }
|
|
||||||
bool Mutex::TryLock() { return TryEnterCriticalSection(&mutex_) != 0; }
|
|
||||||
void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks
|
|
||||||
void Mutex::ReaderUnlock() { Unlock(); }
|
void Mutex::ReaderUnlock() { Unlock(); }
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// --------------------------------------------------------------------------
|
// --------------------------------------------------------------------------
|
||||||
// Some helper classes
|
// Some helper classes
|
||||||
|
|
||||||
@ -152,9 +93,9 @@ class MutexLock {
|
|||||||
~MutexLock() { mu_->Unlock(); }
|
~MutexLock() { mu_->Unlock(); }
|
||||||
private:
|
private:
|
||||||
Mutex * const mu_;
|
Mutex * const mu_;
|
||||||
// Disallow "evil" constructors
|
|
||||||
MutexLock(const MutexLock&);
|
MutexLock(const MutexLock&) = delete;
|
||||||
void operator=(const MutexLock&);
|
MutexLock& operator=(const MutexLock&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
|
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
|
||||||
@ -164,9 +105,9 @@ class ReaderMutexLock {
|
|||||||
~ReaderMutexLock() { mu_->ReaderUnlock(); }
|
~ReaderMutexLock() { mu_->ReaderUnlock(); }
|
||||||
private:
|
private:
|
||||||
Mutex * const mu_;
|
Mutex * const mu_;
|
||||||
// Disallow "evil" constructors
|
|
||||||
ReaderMutexLock(const ReaderMutexLock&);
|
ReaderMutexLock(const ReaderMutexLock&) = delete;
|
||||||
void operator=(const ReaderMutexLock&);
|
ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
class WriterMutexLock {
|
class WriterMutexLock {
|
||||||
@ -175,37 +116,16 @@ class WriterMutexLock {
|
|||||||
~WriterMutexLock() { mu_->WriterUnlock(); }
|
~WriterMutexLock() { mu_->WriterUnlock(); }
|
||||||
private:
|
private:
|
||||||
Mutex * const mu_;
|
Mutex * const mu_;
|
||||||
// Disallow "evil" constructors
|
|
||||||
WriterMutexLock(const WriterMutexLock&);
|
WriterMutexLock(const WriterMutexLock&) = delete;
|
||||||
void operator=(const WriterMutexLock&);
|
WriterMutexLock& operator=(const WriterMutexLock&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
|
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
|
||||||
#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name)
|
#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
|
||||||
#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name)
|
#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
|
||||||
#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name)
|
#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
|
||||||
|
|
||||||
// Provide safe way to declare and use global, linker-initialized mutex. Sigh.
|
|
||||||
#ifdef HAVE_PTHREAD
|
|
||||||
|
|
||||||
#define GLOBAL_MUTEX(name) \
|
|
||||||
static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER
|
|
||||||
#define GLOBAL_MUTEX_LOCK(name) \
|
|
||||||
pthread_mutex_lock(&(name))
|
|
||||||
#define GLOBAL_MUTEX_UNLOCK(name) \
|
|
||||||
pthread_mutex_unlock(&(name))
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#define GLOBAL_MUTEX(name) \
|
|
||||||
static Mutex name
|
|
||||||
#define GLOBAL_MUTEX_LOCK(name) \
|
|
||||||
name.Lock()
|
|
||||||
#define GLOBAL_MUTEX_UNLOCK(name) \
|
|
||||||
name.Unlock()
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif /* #define RE2_UTIL_MUTEX_H_ */
|
#endif // UTIL_MUTEX_H_
|
||||||
|
@ -6,12 +6,25 @@
|
|||||||
// The main changes are the addition of the HitLimit method and
|
// The main changes are the addition of the HitLimit method and
|
||||||
// compilation as PCRE in namespace re2.
|
// compilation as PCRE in namespace re2.
|
||||||
|
|
||||||
|
#include <assert.h>
|
||||||
|
#include <ctype.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <limits>
|
||||||
|
#include <string>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
#include "util/flags.h"
|
#include "util/flags.h"
|
||||||
|
#include "util/logging.h"
|
||||||
#include "util/pcre.h"
|
#include "util/pcre.h"
|
||||||
|
#include "util/strutil.h"
|
||||||
|
|
||||||
#if __GNUC__ > 5
|
// Silence warnings about the wacky formatting in the operator() functions.
|
||||||
|
// Note that we test for Clang first because it defines __GNUC__ as well.
|
||||||
|
#if defined(__clang__)
|
||||||
|
#elif defined(__GNUC__) && __GNUC__ >= 6
|
||||||
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
|
#pragma GCC diagnostic ignored "-Wmisleading-indentation"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -26,6 +39,42 @@ DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)");
|
|||||||
DEFINE_int32(regexp_match_limit, 1000000,
|
DEFINE_int32(regexp_match_limit, 1000000,
|
||||||
"default PCRE match limit (function calls)");
|
"default PCRE match limit (function calls)");
|
||||||
|
|
||||||
|
#ifndef USEPCRE
|
||||||
|
|
||||||
|
// Fake just enough of the PCRE API to allow this file to build. :)
|
||||||
|
|
||||||
|
struct pcre_extra {
|
||||||
|
int flags;
|
||||||
|
int match_limit;
|
||||||
|
int match_limit_recursion;
|
||||||
|
};
|
||||||
|
|
||||||
|
#define PCRE_EXTRA_MATCH_LIMIT 0
|
||||||
|
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
|
||||||
|
#define PCRE_ANCHORED 0
|
||||||
|
#define PCRE_NOTEMPTY 0
|
||||||
|
#define PCRE_ERROR_NOMATCH 1
|
||||||
|
#define PCRE_ERROR_MATCHLIMIT 2
|
||||||
|
#define PCRE_ERROR_RECURSIONLIMIT 3
|
||||||
|
#define PCRE_INFO_CAPTURECOUNT 0
|
||||||
|
|
||||||
|
void pcre_free(void*) {
|
||||||
|
}
|
||||||
|
|
||||||
|
pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
// Maximum number of args we can set
|
// Maximum number of args we can set
|
||||||
@ -117,7 +166,7 @@ pcre* PCRE::Compile(Anchor anchor) {
|
|||||||
// ANCHOR_BOTH Tack a "\z" to the end of the original pattern
|
// ANCHOR_BOTH Tack a "\z" to the end of the original pattern
|
||||||
// and use a pcre anchored match.
|
// and use a pcre anchored match.
|
||||||
|
|
||||||
const char* error;
|
const char* error = "";
|
||||||
int eoffset;
|
int eoffset;
|
||||||
pcre* re;
|
pcre* re;
|
||||||
if (anchor != ANCHOR_BOTH) {
|
if (anchor != ANCHOR_BOTH) {
|
||||||
@ -181,8 +230,8 @@ bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text,
|
|||||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||||
done:
|
done:
|
||||||
|
|
||||||
int consumed;
|
size_t consumed;
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
|
return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -224,8 +273,8 @@ bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text,
|
|||||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||||
done:
|
done:
|
||||||
|
|
||||||
int consumed;
|
size_t consumed;
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
|
return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -267,8 +316,8 @@ bool PCRE::ConsumeFunctor::operator ()(StringPiece* input,
|
|||||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||||
done:
|
done:
|
||||||
|
|
||||||
int consumed;
|
size_t consumed;
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
|
if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed,
|
||||||
args, n, vec, kVecSize)) {
|
args, n, vec, kVecSize)) {
|
||||||
input->remove_prefix(consumed);
|
input->remove_prefix(consumed);
|
||||||
@ -316,8 +365,8 @@ bool PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input,
|
|||||||
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
if (&a15 == &no_more_args) goto done; args[n++] = &a15;
|
||||||
done:
|
done:
|
||||||
|
|
||||||
int consumed;
|
size_t consumed;
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
|
if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed,
|
||||||
args, n, vec, kVecSize)) {
|
args, n, vec, kVecSize)) {
|
||||||
input->remove_prefix(consumed);
|
input->remove_prefix(consumed);
|
||||||
@ -330,7 +379,7 @@ done:
|
|||||||
bool PCRE::Replace(string *str,
|
bool PCRE::Replace(string *str,
|
||||||
const PCRE& pattern,
|
const PCRE& pattern,
|
||||||
const StringPiece& rewrite) {
|
const StringPiece& rewrite) {
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
|
int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize);
|
||||||
if (matches == 0)
|
if (matches == 0)
|
||||||
return false;
|
return false;
|
||||||
@ -349,12 +398,12 @@ int PCRE::GlobalReplace(string *str,
|
|||||||
const PCRE& pattern,
|
const PCRE& pattern,
|
||||||
const StringPiece& rewrite) {
|
const StringPiece& rewrite) {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
string out;
|
string out;
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
bool last_match_was_empty_string = false;
|
bool last_match_was_empty_string = false;
|
||||||
|
|
||||||
for (; start <= str->length();) {
|
while (start <= str->size()) {
|
||||||
// If the previous match was for the empty string, we shouldn't
|
// If the previous match was for the empty string, we shouldn't
|
||||||
// just match again: we'll match in the same way and get an
|
// just match again: we'll match in the same way and get an
|
||||||
// infinite loop. Instead, we do the match in a special way:
|
// infinite loop. Instead, we do the match in a special way:
|
||||||
@ -370,19 +419,20 @@ int PCRE::GlobalReplace(string *str,
|
|||||||
matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
|
matches = pattern.TryMatch(*str, start, ANCHOR_START, false,
|
||||||
vec, kVecSize);
|
vec, kVecSize);
|
||||||
if (matches <= 0) {
|
if (matches <= 0) {
|
||||||
if (start < str->length())
|
if (start < str->size())
|
||||||
out.push_back((*str)[start]);
|
out.push_back((*str)[start]);
|
||||||
start++;
|
start++;
|
||||||
last_match_was_empty_string = false;
|
last_match_was_empty_string = false;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize);
|
matches = pattern.TryMatch(*str, start, UNANCHORED, true,
|
||||||
|
vec, kVecSize);
|
||||||
if (matches <= 0)
|
if (matches <= 0)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
int matchstart = vec[0], matchend = vec[1];
|
size_t matchstart = vec[0], matchend = vec[1];
|
||||||
assert(matchstart >= static_cast<int>(start));
|
assert(matchstart >= start);
|
||||||
assert(matchend >= matchstart);
|
assert(matchend >= matchstart);
|
||||||
|
|
||||||
out.append(*str, start, matchstart - start);
|
out.append(*str, start, matchstart - start);
|
||||||
@ -395,8 +445,9 @@ int PCRE::GlobalReplace(string *str,
|
|||||||
if (count == 0)
|
if (count == 0)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (start < str->length())
|
if (start < str->size())
|
||||||
out.append(*str, start, str->length() - start);
|
out.append(*str, start, str->size() - start);
|
||||||
|
using std::swap;
|
||||||
swap(out, *str);
|
swap(out, *str);
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
@ -405,7 +456,7 @@ bool PCRE::Extract(const StringPiece &text,
|
|||||||
const PCRE& pattern,
|
const PCRE& pattern,
|
||||||
const StringPiece &rewrite,
|
const StringPiece &rewrite,
|
||||||
string *out) {
|
string *out) {
|
||||||
int vec[kVecSize];
|
int vec[kVecSize] = {};
|
||||||
int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
|
int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize);
|
||||||
if (matches == 0)
|
if (matches == 0)
|
||||||
return false;
|
return false;
|
||||||
@ -424,7 +475,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) {
|
|||||||
// that. (This also makes it identical to the perl function of the
|
// that. (This also makes it identical to the perl function of the
|
||||||
// same name except for the null-character special case;
|
// same name except for the null-character special case;
|
||||||
// see `perldoc -f quotemeta`.)
|
// see `perldoc -f quotemeta`.)
|
||||||
for (int ii = 0; ii < unquoted.length(); ++ii) {
|
for (size_t ii = 0; ii < unquoted.size(); ++ii) {
|
||||||
// Note that using 'isalnum' here raises the benchmark time from
|
// Note that using 'isalnum' here raises the benchmark time from
|
||||||
// 32ns to 58ns:
|
// 32ns to 58ns:
|
||||||
if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
|
if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
|
||||||
@ -451,7 +502,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) {
|
|||||||
/***** Actual matching and rewriting code *****/
|
/***** Actual matching and rewriting code *****/
|
||||||
|
|
||||||
bool PCRE::HitLimit() {
|
bool PCRE::HitLimit() {
|
||||||
return hit_limit_;
|
return hit_limit_ != 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void PCRE::ClearHitLimit() {
|
void PCRE::ClearHitLimit() {
|
||||||
@ -459,7 +510,7 @@ void PCRE::ClearHitLimit() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int PCRE::TryMatch(const StringPiece& text,
|
int PCRE::TryMatch(const StringPiece& text,
|
||||||
int startpos,
|
size_t startpos,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
bool empty_ok,
|
bool empty_ok,
|
||||||
int *vec,
|
int *vec,
|
||||||
@ -499,8 +550,8 @@ int PCRE::TryMatch(const StringPiece& text,
|
|||||||
int rc = pcre_exec(re, // The regular expression object
|
int rc = pcre_exec(re, // The regular expression object
|
||||||
&extra,
|
&extra,
|
||||||
(text.data() == NULL) ? "" : text.data(),
|
(text.data() == NULL) ? "" : text.data(),
|
||||||
text.size(),
|
static_cast<int>(text.size()),
|
||||||
startpos,
|
static_cast<int>(startpos),
|
||||||
options,
|
options,
|
||||||
vec,
|
vec,
|
||||||
vecsize);
|
vecsize);
|
||||||
@ -554,14 +605,9 @@ int PCRE::TryMatch(const StringPiece& text,
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !__clang__
|
|
||||||
#pragma GCC diagnostic push
|
|
||||||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
bool PCRE::DoMatchImpl(const StringPiece& text,
|
bool PCRE::DoMatchImpl(const StringPiece& text,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
int* consumed,
|
size_t* consumed,
|
||||||
const Arg* const* args,
|
const Arg* const* args,
|
||||||
int n,
|
int n,
|
||||||
int* vec,
|
int* vec,
|
||||||
@ -589,7 +635,17 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
|
|||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
const int start = vec[2*(i+1)];
|
const int start = vec[2*(i+1)];
|
||||||
const int limit = vec[2*(i+1)+1];
|
const int limit = vec[2*(i+1)+1];
|
||||||
if (!args[i]->Parse(text.data() + start, limit-start)) {
|
|
||||||
|
// Avoid invoking undefined behavior when text.data() happens
|
||||||
|
// to be null and start happens to be -1, the latter being the
|
||||||
|
// case for an unmatched subexpression. Even if text.data() is
|
||||||
|
// not null, pointing one byte before was a longstanding bug.
|
||||||
|
const char* addr = NULL;
|
||||||
|
if (start != -1) {
|
||||||
|
addr = text.data() + start;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!args[i]->Parse(addr, limit-start)) {
|
||||||
// TODO: Should we indicate what the error was?
|
// TODO: Should we indicate what the error was?
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -598,17 +654,13 @@ bool PCRE::DoMatchImpl(const StringPiece& text,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !__clang__
|
|
||||||
#pragma GCC diagnostic pop
|
|
||||||
#endif
|
|
||||||
|
|
||||||
bool PCRE::DoMatch(const StringPiece& text,
|
bool PCRE::DoMatch(const StringPiece& text,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
int* consumed,
|
size_t* consumed,
|
||||||
const Arg* const args[],
|
const Arg* const args[],
|
||||||
int n) const {
|
int n) const {
|
||||||
assert(n >= 0);
|
assert(n >= 0);
|
||||||
size_t const vecsize = (1 + n) * 3; // results + PCRE workspace
|
const int vecsize = (1 + n) * 3; // results + PCRE workspace
|
||||||
// (as for kVecSize)
|
// (as for kVecSize)
|
||||||
int* vec = new int[vecsize];
|
int* vec = new int[vecsize];
|
||||||
bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
|
bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
|
||||||
@ -695,41 +747,52 @@ int PCRE::NumberOfCapturingGroups() const {
|
|||||||
if (re_partial_ == NULL) return -1;
|
if (re_partial_ == NULL) return -1;
|
||||||
|
|
||||||
int result;
|
int result;
|
||||||
CHECK(pcre_fullinfo(re_partial_, // The regular expression object
|
int rc = pcre_fullinfo(re_partial_, // The regular expression object
|
||||||
NULL, // We did not study the pattern
|
NULL, // We did not study the pattern
|
||||||
PCRE_INFO_CAPTURECOUNT,
|
PCRE_INFO_CAPTURECOUNT,
|
||||||
&result) == 0);
|
&result);
|
||||||
|
if (rc != 0) {
|
||||||
|
PCREPORT(ERROR) << "Unexpected return code: " << rc;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/***** Parsers for various types *****/
|
/***** Parsers for various types *****/
|
||||||
|
|
||||||
bool PCRE::Arg::parse_null(const char* str, int n, void* dest) {
|
bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) {
|
||||||
// We fail if somebody asked us to store into a non-NULL void* pointer
|
// We fail if somebody asked us to store into a non-NULL void* pointer
|
||||||
return (dest == NULL);
|
return (dest == NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_string(const char* str, int n, void* dest) {
|
bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) {
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
reinterpret_cast<string*>(dest)->assign(str, n);
|
reinterpret_cast<string*>(dest)->assign(str, n);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) {
|
bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) {
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
reinterpret_cast<StringPiece*>(dest)->set(str, n);
|
*(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_char(const char* str, int n, void* dest) {
|
bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) {
|
||||||
if (n != 1) return false;
|
if (n != 1) return false;
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<char*>(dest)) = str[0];
|
*(reinterpret_cast<char*>(dest)) = str[0];
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) {
|
bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) {
|
||||||
|
if (n != 1) return false;
|
||||||
|
if (dest == NULL) return true;
|
||||||
|
*(reinterpret_cast<signed char*>(dest)) = str[0];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) {
|
||||||
if (n != 1) return false;
|
if (n != 1) return false;
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
|
*(reinterpret_cast<unsigned char*>(dest)) = str[0];
|
||||||
@ -746,7 +809,7 @@ static const int kMaxNumberLength = 32;
|
|||||||
// a. "str" if no termination is needed
|
// a. "str" if no termination is needed
|
||||||
// b. "buf" if the string was copied and null-terminated
|
// b. "buf" if the string was copied and null-terminated
|
||||||
// c. "" if the input was invalid and has no hope of being parsed
|
// c. "" if the input was invalid and has no hope of being parsed
|
||||||
static const char* TerminateNumber(char* buf, const char* str, int n) {
|
static const char* TerminateNumber(char* buf, const char* str, size_t n) {
|
||||||
if ((n > 0) && isspace(*str)) {
|
if ((n > 0) && isspace(*str)) {
|
||||||
// We are less forgiving than the strtoxxx() routines and do not
|
// We are less forgiving than the strtoxxx() routines and do not
|
||||||
// allow leading spaces.
|
// allow leading spaces.
|
||||||
@ -769,7 +832,7 @@ static const char* TerminateNumber(char* buf, const char* str, int n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_long_radix(const char* str,
|
bool PCRE::Arg::parse_long_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
if (n == 0) return false;
|
if (n == 0) return false;
|
||||||
@ -786,7 +849,7 @@ bool PCRE::Arg::parse_long_radix(const char* str,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_ulong_radix(const char* str,
|
bool PCRE::Arg::parse_ulong_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
if (n == 0) return false;
|
if (n == 0) return false;
|
||||||
@ -809,55 +872,55 @@ bool PCRE::Arg::parse_ulong_radix(const char* str,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_short_radix(const char* str,
|
bool PCRE::Arg::parse_short_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
long r;
|
long r;
|
||||||
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
||||||
if ((short)r != r) return false; // Out of range
|
if ((short)r != r) return false; // Out of range
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<short*>(dest)) = r;
|
*(reinterpret_cast<short*>(dest)) = (short)r;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_ushort_radix(const char* str,
|
bool PCRE::Arg::parse_ushort_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
unsigned long r;
|
unsigned long r;
|
||||||
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
||||||
if ((ushort)r != r) return false; // Out of range
|
if ((unsigned short)r != r) return false; // Out of range
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<unsigned short*>(dest)) = r;
|
*(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_int_radix(const char* str,
|
bool PCRE::Arg::parse_int_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
long r;
|
long r;
|
||||||
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
|
||||||
if ((int)r != r) return false; // Out of range
|
if ((int)r != r) return false; // Out of range
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<int*>(dest)) = r;
|
*(reinterpret_cast<int*>(dest)) = (int)r;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_uint_radix(const char* str,
|
bool PCRE::Arg::parse_uint_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
unsigned long r;
|
unsigned long r;
|
||||||
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
|
||||||
if ((uint)r != r) return false; // Out of range
|
if ((unsigned int)r != r) return false; // Out of range
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<unsigned int*>(dest)) = r;
|
*(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_longlong_radix(const char* str,
|
bool PCRE::Arg::parse_longlong_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
if (n == 0) return false;
|
if (n == 0) return false;
|
||||||
@ -865,16 +928,16 @@ bool PCRE::Arg::parse_longlong_radix(const char* str,
|
|||||||
str = TerminateNumber(buf, str, n);
|
str = TerminateNumber(buf, str, n);
|
||||||
char* end;
|
char* end;
|
||||||
errno = 0;
|
errno = 0;
|
||||||
int64 r = strtoll(str, &end, radix);
|
long long r = strtoll(str, &end, radix);
|
||||||
if (end != str + n) return false; // Leftover junk
|
if (end != str + n) return false; // Leftover junk
|
||||||
if (errno) return false;
|
if (errno) return false;
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<int64*>(dest)) = r;
|
*(reinterpret_cast<long long*>(dest)) = r;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_ulonglong_radix(const char* str,
|
bool PCRE::Arg::parse_ulonglong_radix(const char* str,
|
||||||
int n,
|
size_t n,
|
||||||
void* dest,
|
void* dest,
|
||||||
int radix) {
|
int radix) {
|
||||||
if (n == 0) return false;
|
if (n == 0) return false;
|
||||||
@ -887,26 +950,32 @@ bool PCRE::Arg::parse_ulonglong_radix(const char* str,
|
|||||||
}
|
}
|
||||||
char* end;
|
char* end;
|
||||||
errno = 0;
|
errno = 0;
|
||||||
uint64 r = strtoull(str, &end, radix);
|
unsigned long long r = strtoull(str, &end, radix);
|
||||||
if (end != str + n) return false; // Leftover junk
|
if (end != str + n) return false; // Leftover junk
|
||||||
if (errno) return false;
|
if (errno) return false;
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
*(reinterpret_cast<uint64*>(dest)) = r;
|
*(reinterpret_cast<unsigned long long*>(dest)) = r;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
|
static bool parse_double_float(const char* str, size_t n, bool isfloat,
|
||||||
|
void* dest) {
|
||||||
if (n == 0) return false;
|
if (n == 0) return false;
|
||||||
static const int kMaxLength = 200;
|
static const int kMaxLength = 200;
|
||||||
char buf[kMaxLength];
|
char buf[kMaxLength];
|
||||||
if (n >= kMaxLength) return false;
|
if (n >= kMaxLength) return false;
|
||||||
memcpy(buf, str, n);
|
memcpy(buf, str, n);
|
||||||
buf[n] = '\0';
|
buf[n] = '\0';
|
||||||
errno = 0;
|
|
||||||
char* end;
|
char* end;
|
||||||
double r = strtod(buf, &end);
|
errno = 0;
|
||||||
|
double r;
|
||||||
|
if (isfloat) {
|
||||||
|
r = strtof(buf, &end);
|
||||||
|
} else {
|
||||||
|
r = strtod(buf, &end);
|
||||||
|
}
|
||||||
if (end != buf + n) {
|
if (end != buf + n) {
|
||||||
#ifdef COMPILER_MSVC
|
#ifdef _WIN32
|
||||||
// Microsoft's strtod() doesn't handle inf and nan, so we have to
|
// Microsoft's strtod() doesn't handle inf and nan, so we have to
|
||||||
// handle it explicitly. Speed is not important here because this
|
// handle it explicitly. Speed is not important here because this
|
||||||
// code is only called in unit tests.
|
// code is only called in unit tests.
|
||||||
@ -918,12 +987,12 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
|
|||||||
} else if ('+' == *i) {
|
} else if ('+' == *i) {
|
||||||
++i;
|
++i;
|
||||||
}
|
}
|
||||||
if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) {
|
if (0 == _stricmp(i, "inf") || 0 == _stricmp(i, "infinity")) {
|
||||||
r = numeric_limits<double>::infinity();
|
r = std::numeric_limits<double>::infinity();
|
||||||
if (!pos)
|
if (!pos)
|
||||||
r = -r;
|
r = -r;
|
||||||
} else if (0 == stricmp(i, "nan")) {
|
} else if (0 == _stricmp(i, "nan")) {
|
||||||
r = numeric_limits<double>::quiet_NaN();
|
r = std::numeric_limits<double>::quiet_NaN();
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -933,42 +1002,47 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) {
|
|||||||
}
|
}
|
||||||
if (errno) return false;
|
if (errno) return false;
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
|
if (isfloat) {
|
||||||
|
*(reinterpret_cast<float*>(dest)) = (float)r;
|
||||||
|
} else {
|
||||||
*(reinterpret_cast<double*>(dest)) = r;
|
*(reinterpret_cast<double*>(dest)) = r;
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PCRE::Arg::parse_float(const char* str, int n, void* dest) {
|
bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) {
|
||||||
double r;
|
return parse_double_float(str, n, false, dest);
|
||||||
if (!parse_double(str, n, &r)) return false;
|
|
||||||
if (dest == NULL) return true;
|
|
||||||
*(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) {
|
||||||
|
return parse_double_float(str, n, true, dest);
|
||||||
|
}
|
||||||
|
|
||||||
#define DEFINE_INTEGER_PARSERS(name) \
|
#define DEFINE_INTEGER_PARSER(name) \
|
||||||
bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \
|
bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \
|
||||||
return parse_##name##_radix(str, n, dest, 10); \
|
return parse_##name##_radix(str, n, dest, 10); \
|
||||||
} \
|
} \
|
||||||
bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
|
bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \
|
||||||
return parse_##name##_radix(str, n, dest, 16); \
|
return parse_##name##_radix(str, n, dest, 16); \
|
||||||
} \
|
} \
|
||||||
bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
|
bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \
|
||||||
|
void* dest) { \
|
||||||
return parse_##name##_radix(str, n, dest, 8); \
|
return parse_##name##_radix(str, n, dest, 8); \
|
||||||
} \
|
} \
|
||||||
bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
|
bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \
|
||||||
|
void* dest) { \
|
||||||
return parse_##name##_radix(str, n, dest, 0); \
|
return parse_##name##_radix(str, n, dest, 0); \
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFINE_INTEGER_PARSERS(short);
|
DEFINE_INTEGER_PARSER(short);
|
||||||
DEFINE_INTEGER_PARSERS(ushort);
|
DEFINE_INTEGER_PARSER(ushort);
|
||||||
DEFINE_INTEGER_PARSERS(int);
|
DEFINE_INTEGER_PARSER(int);
|
||||||
DEFINE_INTEGER_PARSERS(uint);
|
DEFINE_INTEGER_PARSER(uint);
|
||||||
DEFINE_INTEGER_PARSERS(long);
|
DEFINE_INTEGER_PARSER(long);
|
||||||
DEFINE_INTEGER_PARSERS(ulong);
|
DEFINE_INTEGER_PARSER(ulong);
|
||||||
DEFINE_INTEGER_PARSERS(longlong);
|
DEFINE_INTEGER_PARSER(longlong);
|
||||||
DEFINE_INTEGER_PARSERS(ulonglong);
|
DEFINE_INTEGER_PARSER(ulonglong);
|
||||||
|
|
||||||
#undef DEFINE_INTEGER_PARSERS
|
#undef DEFINE_INTEGER_PARSER
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_PCRE_H_
|
||||||
|
#define UTIL_PCRE_H_
|
||||||
|
|
||||||
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
|
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
|
||||||
// The main changes are the addition of the HitLimit method and
|
// The main changes are the addition of the HitLimit method and
|
||||||
// compilation as PCRE in namespace re2.
|
// compilation as PCRE in namespace re2.
|
||||||
@ -167,22 +170,9 @@ namespace re2 {
|
|||||||
const bool UsingPCRE = true;
|
const bool UsingPCRE = true;
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
#else
|
#else
|
||||||
|
struct pcre; // opaque
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
const bool UsingPCRE = false;
|
const bool UsingPCRE = false;
|
||||||
struct pcre;
|
|
||||||
struct pcre_extra { int flags, match_limit, match_limit_recursion; };
|
|
||||||
#define pcre_free(x) {}
|
|
||||||
#define PCRE_EXTRA_MATCH_LIMIT 0
|
|
||||||
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0
|
|
||||||
#define PCRE_ANCHORED 0
|
|
||||||
#define PCRE_NOTEMPTY 0
|
|
||||||
#define PCRE_ERROR_NOMATCH 1
|
|
||||||
#define PCRE_ERROR_MATCHLIMIT 2
|
|
||||||
#define PCRE_ERROR_RECURSIONLIMIT 3
|
|
||||||
#define PCRE_INFO_CAPTURECOUNT 0
|
|
||||||
#define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); })
|
|
||||||
#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; })
|
|
||||||
#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; })
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -258,7 +248,7 @@ class PCRE {
|
|||||||
// type, or one of:
|
// type, or one of:
|
||||||
// string (matched piece is copied to string)
|
// string (matched piece is copied to string)
|
||||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
// StringPiece (StringPiece is mutated to point to matched piece)
|
||||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
|
||||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
||||||
//
|
//
|
||||||
// Returns true iff all of the following conditions are satisfied:
|
// Returns true iff all of the following conditions are satisfied:
|
||||||
@ -452,7 +442,7 @@ class PCRE {
|
|||||||
// "*consumed" if successful.
|
// "*consumed" if successful.
|
||||||
bool DoMatch(const StringPiece& text,
|
bool DoMatch(const StringPiece& text,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
int* consumed,
|
size_t* consumed,
|
||||||
const Arg* const* args, int n) const;
|
const Arg* const* args, int n) const;
|
||||||
|
|
||||||
// Return the number of capturing subpatterns, or -1 if the
|
// Return the number of capturing subpatterns, or -1 if the
|
||||||
@ -475,7 +465,7 @@ class PCRE {
|
|||||||
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
|
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
|
||||||
// But the values for all subpattern are filled in into "vec".
|
// But the values for all subpattern are filled in into "vec".
|
||||||
int TryMatch(const StringPiece& text,
|
int TryMatch(const StringPiece& text,
|
||||||
int startpos,
|
size_t startpos,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
bool empty_ok,
|
bool empty_ok,
|
||||||
int *vec,
|
int *vec,
|
||||||
@ -492,7 +482,7 @@ class PCRE {
|
|||||||
// internal implementation for DoMatch
|
// internal implementation for DoMatch
|
||||||
bool DoMatchImpl(const StringPiece& text,
|
bool DoMatchImpl(const StringPiece& text,
|
||||||
Anchor anchor,
|
Anchor anchor,
|
||||||
int* consumed,
|
size_t* consumed,
|
||||||
const Arg* const args[],
|
const Arg* const args[],
|
||||||
int n,
|
int n,
|
||||||
int* vec,
|
int* vec,
|
||||||
@ -510,7 +500,9 @@ class PCRE {
|
|||||||
int match_limit_; // Limit on execution resources
|
int match_limit_; // Limit on execution resources
|
||||||
int stack_limit_; // Limit on stack resources (bytes)
|
int stack_limit_; // Limit on stack resources (bytes)
|
||||||
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
|
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(PCRE);
|
|
||||||
|
PCRE(const PCRE&) = delete;
|
||||||
|
PCRE& operator=(const PCRE&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
|
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
|
||||||
@ -565,7 +557,7 @@ class PCRE_Options {
|
|||||||
template <class T>
|
template <class T>
|
||||||
class _PCRE_MatchObject {
|
class _PCRE_MatchObject {
|
||||||
public:
|
public:
|
||||||
static inline bool Parse(const char* str, int n, void* dest) {
|
static inline bool Parse(const char* str, size_t n, void* dest) {
|
||||||
if (dest == NULL) return true;
|
if (dest == NULL) return true;
|
||||||
T* object = reinterpret_cast<T*>(dest);
|
T* object = reinterpret_cast<T*>(dest);
|
||||||
return object->ParseFrom(str, n);
|
return object->ParseFrom(str, n);
|
||||||
@ -580,16 +572,21 @@ class PCRE::Arg {
|
|||||||
// Constructor specially designed for NULL arguments
|
// Constructor specially designed for NULL arguments
|
||||||
Arg(void*);
|
Arg(void*);
|
||||||
|
|
||||||
typedef bool (*Parser)(const char* str, int n, void* dest);
|
typedef bool (*Parser)(const char* str, size_t n, void* dest);
|
||||||
|
|
||||||
// Type-specific parsers
|
// Type-specific parsers
|
||||||
#define MAKE_PARSER(type, name) \
|
#define MAKE_PARSER(type, name) \
|
||||||
Arg(type* p) : arg_(p), parser_(name) {} \
|
Arg(type* p) : arg_(p), parser_(name) {} \
|
||||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
|
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
|
||||||
|
|
||||||
|
|
||||||
MAKE_PARSER(char, parse_char);
|
MAKE_PARSER(char, parse_char);
|
||||||
|
MAKE_PARSER(signed char, parse_schar);
|
||||||
MAKE_PARSER(unsigned char, parse_uchar);
|
MAKE_PARSER(unsigned char, parse_uchar);
|
||||||
|
MAKE_PARSER(float, parse_float);
|
||||||
|
MAKE_PARSER(double, parse_double);
|
||||||
|
MAKE_PARSER(string, parse_string);
|
||||||
|
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||||
|
|
||||||
MAKE_PARSER(short, parse_short);
|
MAKE_PARSER(short, parse_short);
|
||||||
MAKE_PARSER(unsigned short, parse_ushort);
|
MAKE_PARSER(unsigned short, parse_ushort);
|
||||||
MAKE_PARSER(int, parse_int);
|
MAKE_PARSER(int, parse_int);
|
||||||
@ -598,10 +595,6 @@ class PCRE::Arg {
|
|||||||
MAKE_PARSER(unsigned long, parse_ulong);
|
MAKE_PARSER(unsigned long, parse_ulong);
|
||||||
MAKE_PARSER(long long, parse_longlong);
|
MAKE_PARSER(long long, parse_longlong);
|
||||||
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
||||||
MAKE_PARSER(float, parse_float);
|
|
||||||
MAKE_PARSER(double, parse_double);
|
|
||||||
MAKE_PARSER(string, parse_string);
|
|
||||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
|
||||||
|
|
||||||
#undef MAKE_PARSER
|
#undef MAKE_PARSER
|
||||||
|
|
||||||
@ -613,29 +606,31 @@ class PCRE::Arg {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Parse the data
|
// Parse the data
|
||||||
bool Parse(const char* str, int n) const;
|
bool Parse(const char* str, size_t n) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void* arg_;
|
void* arg_;
|
||||||
Parser parser_;
|
Parser parser_;
|
||||||
|
|
||||||
static bool parse_null (const char* str, int n, void* dest);
|
static bool parse_null (const char* str, size_t n, void* dest);
|
||||||
static bool parse_char (const char* str, int n, void* dest);
|
static bool parse_char (const char* str, size_t n, void* dest);
|
||||||
static bool parse_uchar (const char* str, int n, void* dest);
|
static bool parse_schar (const char* str, size_t n, void* dest);
|
||||||
static bool parse_float (const char* str, int n, void* dest);
|
static bool parse_uchar (const char* str, size_t n, void* dest);
|
||||||
static bool parse_double (const char* str, int n, void* dest);
|
static bool parse_float (const char* str, size_t n, void* dest);
|
||||||
static bool parse_string (const char* str, int n, void* dest);
|
static bool parse_double (const char* str, size_t n, void* dest);
|
||||||
static bool parse_stringpiece (const char* str, int n, void* dest);
|
static bool parse_string (const char* str, size_t n, void* dest);
|
||||||
|
static bool parse_stringpiece (const char* str, size_t n, void* dest);
|
||||||
|
|
||||||
#define DECLARE_INTEGER_PARSER(name) \
|
#define DECLARE_INTEGER_PARSER(name) \
|
||||||
private: \
|
private: \
|
||||||
static bool parse_ ## name(const char* str, int n, void* dest); \
|
static bool parse_##name(const char* str, size_t n, void* dest); \
|
||||||
static bool parse_ ## name ## _radix( \
|
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
|
||||||
const char* str, int n, void* dest, int radix); \
|
int radix); \
|
||||||
|
\
|
||||||
public: \
|
public: \
|
||||||
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
|
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
|
||||||
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
|
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
|
||||||
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
|
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
|
||||||
|
|
||||||
DECLARE_INTEGER_PARSER(short);
|
DECLARE_INTEGER_PARSER(short);
|
||||||
DECLARE_INTEGER_PARSER(ushort);
|
DECLARE_INTEGER_PARSER(ushort);
|
||||||
@ -647,23 +642,27 @@ class PCRE::Arg {
|
|||||||
DECLARE_INTEGER_PARSER(ulonglong);
|
DECLARE_INTEGER_PARSER(ulonglong);
|
||||||
|
|
||||||
#undef DECLARE_INTEGER_PARSER
|
#undef DECLARE_INTEGER_PARSER
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
||||||
inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
||||||
|
|
||||||
inline bool PCRE::Arg::Parse(const char* str, int n) const {
|
inline bool PCRE::Arg::Parse(const char* str, size_t n) const {
|
||||||
return (*parser_)(str, n, arg_);
|
return (*parser_)(str, n, arg_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This part of the parser, appropriate only for ints, deals with bases
|
// This part of the parser, appropriate only for ints, deals with bases
|
||||||
#define MAKE_INTEGER_PARSER(type, name) \
|
#define MAKE_INTEGER_PARSER(type, name) \
|
||||||
inline PCRE::Arg Hex(type* ptr) { \
|
inline PCRE::Arg Hex(type* ptr) { \
|
||||||
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \
|
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
|
||||||
|
} \
|
||||||
inline PCRE::Arg Octal(type* ptr) { \
|
inline PCRE::Arg Octal(type* ptr) { \
|
||||||
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \
|
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
|
||||||
|
} \
|
||||||
inline PCRE::Arg CRadix(type* ptr) { \
|
inline PCRE::Arg CRadix(type* ptr) { \
|
||||||
return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); }
|
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
|
||||||
|
}
|
||||||
|
|
||||||
MAKE_INTEGER_PARSER(short, short);
|
MAKE_INTEGER_PARSER(short, short);
|
||||||
MAKE_INTEGER_PARSER(unsigned short, ushort);
|
MAKE_INTEGER_PARSER(unsigned short, ushort);
|
||||||
@ -677,3 +676,5 @@ MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
|
|||||||
#undef MAKE_INTEGER_PARSER
|
#undef MAKE_INTEGER_PARSER
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
|
#endif // UTIL_PCRE_H_
|
||||||
|
@ -1,34 +0,0 @@
|
|||||||
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Modified from Google perftools's tcmalloc_unittest.cc.
|
|
||||||
|
|
||||||
#include "util/random.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
int32 ACMRandom::Next() {
|
|
||||||
const int32 M = 2147483647L; // 2^31-1
|
|
||||||
const int32 A = 16807;
|
|
||||||
// In effect, we are computing seed_ = (seed_ * A) % M, where M = 2^31-1
|
|
||||||
uint32 lo = A * (int32)(seed_ & 0xFFFF);
|
|
||||||
uint32 hi = A * (int32)((uint32)seed_ >> 16);
|
|
||||||
lo += (hi & 0x7FFF) << 16;
|
|
||||||
if (lo > M) {
|
|
||||||
lo &= M;
|
|
||||||
++lo;
|
|
||||||
}
|
|
||||||
lo += hi >> 15;
|
|
||||||
if (lo > M) {
|
|
||||||
lo &= M;
|
|
||||||
++lo;
|
|
||||||
}
|
|
||||||
return (seed_ = (int32) lo);
|
|
||||||
}
|
|
||||||
|
|
||||||
int32 ACMRandom::Uniform(int32 n) {
|
|
||||||
return Next() % n;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -1,29 +0,0 @@
|
|||||||
// Copyright 2005-2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
// Modified from Google perftools's tcmalloc_unittest.cc.
|
|
||||||
|
|
||||||
#ifndef RE2_UTIL_RANDOM_H__
|
|
||||||
#define RE2_UTIL_RANDOM_H__
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
// ACM minimal standard random number generator. (re-entrant.)
|
|
||||||
class ACMRandom {
|
|
||||||
public:
|
|
||||||
ACMRandom(int32 seed) : seed_(seed) {}
|
|
||||||
int32 Next();
|
|
||||||
int32 Uniform(int32);
|
|
||||||
|
|
||||||
void Reset(int32 seed) { seed_ = seed; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
int32 seed_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_UTIL_RANDOM_H__
|
|
@ -11,8 +11,10 @@
|
|||||||
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
|
||||||
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "util/utf.h"
|
#include "util/utf.h"
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
@ -133,7 +135,7 @@ runetochar(char *str, const Rune *rune)
|
|||||||
*/
|
*/
|
||||||
c = *rune;
|
c = *rune;
|
||||||
if(c <= Rune1) {
|
if(c <= Rune1) {
|
||||||
str[0] = c;
|
str[0] = static_cast<char>(c);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,7 +144,7 @@ runetochar(char *str, const Rune *rune)
|
|||||||
* 0080-07FF => T2 Tx
|
* 0080-07FF => T2 Tx
|
||||||
*/
|
*/
|
||||||
if(c <= Rune2) {
|
if(c <= Rune2) {
|
||||||
str[0] = T2 | (c >> 1*Bitx);
|
str[0] = T2 | static_cast<char>(c >> 1*Bitx);
|
||||||
str[1] = Tx | (c & Maskx);
|
str[1] = Tx | (c & Maskx);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
@ -161,7 +163,7 @@ runetochar(char *str, const Rune *rune)
|
|||||||
* 0800-FFFF => T3 Tx Tx
|
* 0800-FFFF => T3 Tx Tx
|
||||||
*/
|
*/
|
||||||
if (c <= Rune3) {
|
if (c <= Rune3) {
|
||||||
str[0] = T3 | (c >> 2*Bitx);
|
str[0] = T3 | static_cast<char>(c >> 2*Bitx);
|
||||||
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
str[2] = Tx | (c & Maskx);
|
str[2] = Tx | (c & Maskx);
|
||||||
return 3;
|
return 3;
|
||||||
@ -171,7 +173,7 @@ runetochar(char *str, const Rune *rune)
|
|||||||
* four character sequence (21-bit value)
|
* four character sequence (21-bit value)
|
||||||
* 10000-1FFFFF => T4 Tx Tx Tx
|
* 10000-1FFFFF => T4 Tx Tx Tx
|
||||||
*/
|
*/
|
||||||
str[0] = T4 | (c >> 3*Bitx);
|
str[0] = T4 | static_cast<char>(c >> 3*Bitx);
|
||||||
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
|
||||||
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
|
||||||
str[3] = Tx | (c & Maskx);
|
str[3] = Tx | (c & Maskx);
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_SPARSE_ARRAY_H_
|
||||||
|
#define UTIL_SPARSE_ARRAY_H_
|
||||||
|
|
||||||
// DESCRIPTION
|
// DESCRIPTION
|
||||||
//
|
//
|
||||||
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
|
||||||
@ -52,47 +55,58 @@
|
|||||||
|
|
||||||
// IMPLEMENTATION
|
// IMPLEMENTATION
|
||||||
//
|
//
|
||||||
// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of
|
// SparseArray is an array dense_ and an array sparse_, both of size max_size_.
|
||||||
// size max_size_. At any point, the number of elements in the sparse array is
|
// At any point, the number of elements in the sparse array is size_.
|
||||||
// size_.
|
|
||||||
//
|
//
|
||||||
// The vector dense_ contains the size_ elements in the sparse array (with
|
// The array dense_ contains the size_ elements in the sparse array (with
|
||||||
// their indices),
|
// their indices),
|
||||||
// in the order that the elements were first inserted. This array is dense:
|
// in the order that the elements were first inserted. This array is dense:
|
||||||
// the size_ pairs are dense_[0] through dense_[size_-1].
|
// the size_ pairs are dense_[0] through dense_[size_-1].
|
||||||
//
|
//
|
||||||
// The array sparse_to_dense_ maps from indices in [0,m) to indices in
|
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
|
||||||
// [0,size_).
|
// For indices present in the array, dense_[sparse_[i]].index_ == i.
|
||||||
// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i.
|
// For indices not present in the array, sparse_ can contain any value at all,
|
||||||
// For indices not present in the array, sparse_to_dense_ can contain
|
// perhaps outside the range [0, size_) but perhaps not.
|
||||||
// any value at all, perhaps outside the range [0, size_) but perhaps not.
|
|
||||||
//
|
//
|
||||||
// The lax requirement on sparse_to_dense_ values makes clearing
|
// The lax requirement on sparse_ values makes clearing the array very easy:
|
||||||
// the array very easy: set size_ to 0. Lookups are slightly more
|
// set size_ to 0. Lookups are slightly more complicated.
|
||||||
// complicated. An index i has a value in the array if and only if:
|
// An index i has a value in the array if and only if:
|
||||||
// sparse_to_dense_[i] is in [0, size_) AND
|
// sparse_[i] is in [0, size_) AND
|
||||||
// dense_[sparse_to_dense_[i]].index_ == i.
|
// dense_[sparse_[i]].index_ == i.
|
||||||
// If both these properties hold, only then it is safe to refer to
|
// If both these properties hold, only then it is safe to refer to
|
||||||
// dense_[sparse_to_dense_[i]].value_
|
// dense_[sparse_[i]].value_
|
||||||
// as the value associated with index i.
|
// as the value associated with index i.
|
||||||
//
|
//
|
||||||
// To insert a new entry, set sparse_to_dense_[i] to size_,
|
// To insert a new entry, set sparse_[i] to size_,
|
||||||
// initialize dense_[size_], and then increment size_.
|
// initialize dense_[size_], and then increment size_.
|
||||||
//
|
//
|
||||||
// Deletion of specific values from the array is implemented by
|
// Deletion of specific values from the array is implemented by
|
||||||
// swapping dense_[size_-1] and the dense_ being deleted and then
|
// swapping dense_[size_-1] and the dense_ being deleted and then
|
||||||
// updating the appropriate sparse_to_dense_ entries.
|
// updating the appropriate sparse_ entries.
|
||||||
//
|
//
|
||||||
// To make the sparse array as efficient as possible for non-primitive types,
|
// To make the sparse array as efficient as possible for non-primitive types,
|
||||||
// elements may or may not be destroyed when they are deleted from the sparse
|
// elements may or may not be destroyed when they are deleted from the sparse
|
||||||
// array through a call to erase(), erase_existing() or resize(). They
|
// array through a call to erase(), erase_existing() or resize(). They
|
||||||
// immediately become inaccessible, but they are only guaranteed to be
|
// immediately become inaccessible, but they are only guaranteed to be
|
||||||
// destroyed when the SparseArray destructor is called.
|
// destroyed when the SparseArray destructor is called.
|
||||||
|
//
|
||||||
|
// A moved-from SparseArray will be empty.
|
||||||
|
|
||||||
#ifndef RE2_UTIL_SPARSE_ARRAY_H__
|
// Doing this simplifies the logic below.
|
||||||
#define RE2_UTIL_SPARSE_ARRAY_H__
|
#ifndef __has_feature
|
||||||
|
#define __has_feature(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "util/util.h"
|
#include <assert.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#if __has_feature(memory_sanitizer)
|
||||||
|
#include <sanitizer/msan_interface.h>
|
||||||
|
#endif
|
||||||
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
|
#include <type_traits>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
@ -100,36 +114,49 @@ template<typename Value>
|
|||||||
class SparseArray {
|
class SparseArray {
|
||||||
public:
|
public:
|
||||||
SparseArray();
|
SparseArray();
|
||||||
SparseArray(int max_size);
|
explicit SparseArray(int max_size);
|
||||||
~SparseArray();
|
~SparseArray();
|
||||||
|
|
||||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
// IndexValue pairs: exposed in SparseArray::iterator.
|
||||||
class IndexValue;
|
class IndexValue;
|
||||||
|
static_assert(std::is_trivially_destructible<IndexValue>::value,
|
||||||
|
"IndexValue must be trivially destructible");
|
||||||
|
|
||||||
typedef IndexValue value_type;
|
typedef IndexValue value_type;
|
||||||
typedef typename vector<IndexValue>::iterator iterator;
|
typedef IndexValue* iterator;
|
||||||
typedef typename vector<IndexValue>::const_iterator const_iterator;
|
typedef const IndexValue* const_iterator;
|
||||||
|
|
||||||
inline const IndexValue& iv(int i) const;
|
SparseArray(const SparseArray& src);
|
||||||
|
SparseArray(SparseArray&& src) /*noexcept*/;
|
||||||
|
|
||||||
|
SparseArray& operator=(const SparseArray& src);
|
||||||
|
SparseArray& operator=(SparseArray&& src) /*noexcept*/;
|
||||||
|
|
||||||
|
const IndexValue& iv(int i) const;
|
||||||
|
|
||||||
// Return the number of entries in the array.
|
// Return the number of entries in the array.
|
||||||
int size() const {
|
int size() const {
|
||||||
return size_;
|
return size_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Indicate whether the array is empty.
|
||||||
|
int empty() const {
|
||||||
|
return size_ == 0;
|
||||||
|
}
|
||||||
|
|
||||||
// Iterate over the array.
|
// Iterate over the array.
|
||||||
iterator begin() {
|
iterator begin() {
|
||||||
return dense_.begin();
|
return dense_.get();
|
||||||
}
|
}
|
||||||
iterator end() {
|
iterator end() {
|
||||||
return dense_.begin() + size_;
|
return dense_.get() + size_;
|
||||||
}
|
}
|
||||||
|
|
||||||
const_iterator begin() const {
|
const_iterator begin() const {
|
||||||
return dense_.begin();
|
return dense_.get();
|
||||||
}
|
}
|
||||||
const_iterator end() const {
|
const_iterator end() const {
|
||||||
return dense_.begin() + size_;
|
return dense_.get() + size_;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Change the maximum size of the array.
|
// Change the maximum size of the array.
|
||||||
@ -148,39 +175,68 @@ class SparseArray {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check whether index i is in the array.
|
// Check whether index i is in the array.
|
||||||
inline bool has_index(int i) const;
|
bool has_index(int i) const;
|
||||||
|
|
||||||
// Comparison function for sorting.
|
// Comparison function for sorting.
|
||||||
// Can sort the sparse array so that future iterations
|
// Can sort the sparse array so that future iterations
|
||||||
// will visit indices in increasing order using
|
// will visit indices in increasing order using
|
||||||
// sort(arr.begin(), arr.end(), arr.less);
|
// std::sort(arr.begin(), arr.end(), arr.less);
|
||||||
static bool less(const IndexValue& a, const IndexValue& b);
|
static bool less(const IndexValue& a, const IndexValue& b);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// Set the value at index i to v.
|
// Set the value at index i to v.
|
||||||
inline iterator set(int i, Value v);
|
iterator set(int i, const Value& v) {
|
||||||
|
return SetInternal(true, i, v);
|
||||||
|
}
|
||||||
|
iterator set(int i, Value&& v) { // NOLINT
|
||||||
|
return SetInternal(true, i, std::move(v));
|
||||||
|
}
|
||||||
|
|
||||||
pair<iterator, bool> insert(const value_type& new_value);
|
std::pair<iterator, bool> insert(const value_type& v) {
|
||||||
|
return InsertInternal(v);
|
||||||
|
}
|
||||||
|
std::pair<iterator, bool> insert(value_type&& v) { // NOLINT
|
||||||
|
return InsertInternal(std::move(v));
|
||||||
|
}
|
||||||
|
|
||||||
// Returns the value at index i
|
template <typename... Args>
|
||||||
// or defaultv if index i is not initialized in the array.
|
std::pair<iterator, bool> emplace(Args&&... args) { // NOLINT
|
||||||
inline Value get(int i, Value defaultv) const;
|
return InsertInternal(value_type(std::forward<Args>(args)...));
|
||||||
|
}
|
||||||
|
|
||||||
iterator find(int i);
|
iterator find(int i) {
|
||||||
|
if (has_index(i))
|
||||||
|
return dense_.get() + sparse_[i];
|
||||||
|
return end();
|
||||||
|
}
|
||||||
|
|
||||||
const_iterator find(int i) const;
|
const_iterator find(int i) const {
|
||||||
|
if (has_index(i))
|
||||||
|
return dense_.get() + sparse_[i];
|
||||||
|
return end();
|
||||||
|
}
|
||||||
|
|
||||||
// Change the value at index i to v.
|
// Change the value at index i to v.
|
||||||
// Fast but unsafe: only use if has_index(i) is true.
|
// Fast but unsafe: only use if has_index(i) is true.
|
||||||
inline iterator set_existing(int i, Value v);
|
iterator set_existing(int i, const Value& v) {
|
||||||
|
return SetExistingInternal(i, v);
|
||||||
|
}
|
||||||
|
iterator set_existing(int i, Value&& v) { // NOLINT
|
||||||
|
return SetExistingInternal(i, std::move(v));
|
||||||
|
}
|
||||||
|
|
||||||
// Set the value at the new index i to v.
|
// Set the value at the new index i to v.
|
||||||
// Fast but unsafe: only use if has_index(i) is false.
|
// Fast but unsafe: only use if has_index(i) is false.
|
||||||
inline iterator set_new(int i, Value v);
|
iterator set_new(int i, const Value& v) {
|
||||||
|
return SetInternal(false, i, v);
|
||||||
|
}
|
||||||
|
iterator set_new(int i, Value&& v) { // NOLINT
|
||||||
|
return SetInternal(false, i, std::move(v));
|
||||||
|
}
|
||||||
|
|
||||||
// Get the value at index i from the array..
|
// Get the value at index i from the array..
|
||||||
// Fast but unsafe: only use if has_index(i) is true.
|
// Fast but unsafe: only use if has_index(i) is true.
|
||||||
inline Value get_existing(int i) const;
|
const Value& get_existing(int i) const;
|
||||||
|
|
||||||
// Erasing items from the array during iteration is in general
|
// Erasing items from the array during iteration is in general
|
||||||
// NOT safe. There is one special case, which is that the current
|
// NOT safe. There is one special case, which is that the current
|
||||||
@ -201,37 +257,132 @@ class SparseArray {
|
|||||||
// the iterators could walk past the end of the array.
|
// the iterators could walk past the end of the array.
|
||||||
|
|
||||||
// Erases the element at index i from the array.
|
// Erases the element at index i from the array.
|
||||||
inline void erase(int i);
|
void erase(int i);
|
||||||
|
|
||||||
// Erases the element at index i from the array.
|
// Erases the element at index i from the array.
|
||||||
// Fast but unsafe: only use if has_index(i) is true.
|
// Fast but unsafe: only use if has_index(i) is true.
|
||||||
inline void erase_existing(int i);
|
void erase_existing(int i);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
template <typename U>
|
||||||
|
std::pair<iterator, bool> InsertInternal(U&& v) {
|
||||||
|
DebugCheckInvariants();
|
||||||
|
std::pair<iterator, bool> p;
|
||||||
|
if (has_index(v.index_)) {
|
||||||
|
p = {dense_.get() + sparse_[v.index_], false};
|
||||||
|
} else {
|
||||||
|
p = {set_new(std::forward<U>(v).index_, std::forward<U>(v).second), true};
|
||||||
|
}
|
||||||
|
DebugCheckInvariants();
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename U>
|
||||||
|
iterator SetInternal(bool allow_overwrite, int i, U&& v) { // NOLINT
|
||||||
|
DebugCheckInvariants();
|
||||||
|
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||||
|
assert(false && "illegal index");
|
||||||
|
// Semantically, end() would be better here, but we already know
|
||||||
|
// the user did something stupid, so begin() insulates them from
|
||||||
|
// dereferencing an invalid pointer.
|
||||||
|
return begin();
|
||||||
|
}
|
||||||
|
if (!allow_overwrite) {
|
||||||
|
assert(!has_index(i));
|
||||||
|
create_index(i);
|
||||||
|
} else {
|
||||||
|
if (!has_index(i))
|
||||||
|
create_index(i);
|
||||||
|
}
|
||||||
|
return set_existing(i, std::forward<U>(v)); // NOLINT
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename U>
|
||||||
|
iterator SetExistingInternal(int i, U&& v) { // NOLINT
|
||||||
|
DebugCheckInvariants();
|
||||||
|
assert(has_index(i));
|
||||||
|
dense_[sparse_[i]].value() = std::forward<U>(v);
|
||||||
|
DebugCheckInvariants();
|
||||||
|
return dense_.get() + sparse_[i];
|
||||||
|
}
|
||||||
|
|
||||||
// Add the index i to the array.
|
// Add the index i to the array.
|
||||||
// Only use if has_index(i) is known to be false.
|
// Only use if has_index(i) is known to be false.
|
||||||
// Since it doesn't set the value associated with i,
|
// Since it doesn't set the value associated with i,
|
||||||
// this function is private, only intended as a helper
|
// this function is private, only intended as a helper
|
||||||
// for other methods.
|
// for other methods.
|
||||||
inline void create_index(int i);
|
void create_index(int i);
|
||||||
|
|
||||||
// In debug mode, verify that some invariant properties of the class
|
// In debug mode, verify that some invariant properties of the class
|
||||||
// are being maintained. This is called at the end of the constructor
|
// are being maintained. This is called at the end of the constructor
|
||||||
// and at the beginning and end of all public non-const member functions.
|
// and at the beginning and end of all public non-const member functions.
|
||||||
inline void DebugCheckInvariants() const;
|
void DebugCheckInvariants() const;
|
||||||
|
|
||||||
int size_;
|
// Initializes memory for elements [min, max).
|
||||||
int max_size_;
|
void MaybeInitializeMemory(int min, int max) {
|
||||||
int* sparse_to_dense_;
|
#if __has_feature(memory_sanitizer)
|
||||||
vector<IndexValue> dense_;
|
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
|
||||||
bool valgrind_;
|
#elif defined(RE2_ON_VALGRIND)
|
||||||
|
for (int i = min; i < max; i++) {
|
||||||
|
sparse_[i] = 0xababababU;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(SparseArray);
|
int size_ = 0;
|
||||||
|
int max_size_ = 0;
|
||||||
|
std::unique_ptr<int[]> sparse_;
|
||||||
|
std::unique_ptr<IndexValue[]> dense_;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
SparseArray<Value>::SparseArray()
|
SparseArray<Value>::SparseArray() = default;
|
||||||
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {}
|
|
||||||
|
template<typename Value>
|
||||||
|
SparseArray<Value>::SparseArray(const SparseArray& src)
|
||||||
|
: size_(src.size_),
|
||||||
|
max_size_(src.max_size_),
|
||||||
|
sparse_(new int[max_size_]),
|
||||||
|
dense_(new IndexValue[max_size_]) {
|
||||||
|
std::copy_n(src.sparse_.get(), max_size_, sparse_.get());
|
||||||
|
std::copy_n(src.dense_.get(), max_size_, dense_.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Value>
|
||||||
|
SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT
|
||||||
|
: size_(src.size_),
|
||||||
|
max_size_(src.max_size_),
|
||||||
|
sparse_(std::move(src.sparse_)),
|
||||||
|
dense_(std::move(src.dense_)) {
|
||||||
|
src.size_ = 0;
|
||||||
|
src.max_size_ = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Value>
|
||||||
|
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
|
||||||
|
size_ = src.size_;
|
||||||
|
max_size_ = src.max_size_;
|
||||||
|
std::unique_ptr<int[]> a(new int[max_size_]);
|
||||||
|
std::copy_n(src.sparse_.get(), src.max_size_, a.get());
|
||||||
|
sparse_ = std::move(a);
|
||||||
|
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size_]);
|
||||||
|
std::copy_n(src.dense_.get(), src.max_size_, b.get());
|
||||||
|
dense_ = std::move(b);
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Value>
|
||||||
|
SparseArray<Value>& SparseArray<Value>::operator=(
|
||||||
|
SparseArray&& src) /*noexcept*/ { // NOLINT
|
||||||
|
size_ = src.size_;
|
||||||
|
max_size_ = src.max_size_;
|
||||||
|
sparse_ = std::move(src.sparse_);
|
||||||
|
dense_ = std::move(src.dense_);
|
||||||
|
// clear out the source
|
||||||
|
src.size_ = 0;
|
||||||
|
src.max_size_ = 0;
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
// IndexValue pairs: exposed in SparseArray::iterator.
|
// IndexValue pairs: exposed in SparseArray::iterator.
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
@ -242,48 +393,55 @@ class SparseArray<Value>::IndexValue {
|
|||||||
typedef Value second_type;
|
typedef Value second_type;
|
||||||
|
|
||||||
IndexValue() {}
|
IndexValue() {}
|
||||||
IndexValue(int index, const Value& value) : second(value), index_(index) {}
|
IndexValue(int i, const Value& v) : index_(i), second(v) {}
|
||||||
|
IndexValue(int i, Value&& v) : index_(i), second(std::move(v)) {}
|
||||||
|
|
||||||
int index() const { return index_; }
|
int index() const { return index_; }
|
||||||
Value value() const { return second; }
|
|
||||||
|
|
||||||
// Provide the data in the 'second' member so that the utilities
|
Value& value() /*&*/ { return second; }
|
||||||
// in map-util work.
|
const Value& value() const /*&*/ { return second; }
|
||||||
Value second;
|
//Value&& value() /*&&*/ { return std::move(second); } // NOLINT
|
||||||
|
|
||||||
private:
|
private:
|
||||||
int index_;
|
int index_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// Provide the data in the 'second' member so that the utilities
|
||||||
|
// in map-util work.
|
||||||
|
// TODO(billydonahue): 'second' is public for short-term compatibility.
|
||||||
|
// Users will be transitioned to using value() accessor.
|
||||||
|
Value second;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
const typename SparseArray<Value>::IndexValue&
|
const typename SparseArray<Value>::IndexValue&
|
||||||
SparseArray<Value>::iv(int i) const {
|
SparseArray<Value>::iv(int i) const {
|
||||||
DCHECK_GE(i, 0);
|
assert(i >= 0);
|
||||||
DCHECK_LT(i, size_);
|
assert(i < size_);
|
||||||
return dense_[i];
|
return dense_[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Change the maximum size of the array.
|
// Change the maximum size of the array.
|
||||||
// Invalidates all iterators.
|
// Invalidates all iterators.
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
void SparseArray<Value>::resize(int new_max_size) {
|
void SparseArray<Value>::resize(int max_size) {
|
||||||
DebugCheckInvariants();
|
DebugCheckInvariants();
|
||||||
if (new_max_size > max_size_) {
|
if (max_size > max_size_) {
|
||||||
int* a = new int[new_max_size];
|
std::unique_ptr<int[]> a(new int[max_size]);
|
||||||
if (sparse_to_dense_) {
|
if (sparse_) {
|
||||||
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
std::copy_n(sparse_.get(), max_size_, a.get());
|
||||||
// Don't need to zero the memory but appease Valgrind.
|
|
||||||
if (valgrind_) {
|
|
||||||
for (int i = max_size_; i < new_max_size; i++)
|
|
||||||
a[i] = 0xababababU;
|
|
||||||
}
|
}
|
||||||
delete[] sparse_to_dense_;
|
sparse_ = std::move(a);
|
||||||
}
|
|
||||||
sparse_to_dense_ = a;
|
|
||||||
|
|
||||||
dense_.resize(new_max_size);
|
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]);
|
||||||
|
if (dense_) {
|
||||||
|
std::copy_n(dense_.get(), max_size_, b.get());
|
||||||
}
|
}
|
||||||
max_size_ = new_max_size;
|
dense_ = std::move(b);
|
||||||
|
|
||||||
|
MaybeInitializeMemory(max_size_, max_size);
|
||||||
|
}
|
||||||
|
max_size_ = max_size;
|
||||||
if (size_ > max_size_)
|
if (size_ > max_size_)
|
||||||
size_ = max_size_;
|
size_ = max_size_;
|
||||||
DebugCheckInvariants();
|
DebugCheckInvariants();
|
||||||
@ -292,97 +450,20 @@ void SparseArray<Value>::resize(int new_max_size) {
|
|||||||
// Check whether index i is in the array.
|
// Check whether index i is in the array.
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
bool SparseArray<Value>::has_index(int i) const {
|
bool SparseArray<Value>::has_index(int i) const {
|
||||||
DCHECK_GE(i, 0);
|
assert(i >= 0);
|
||||||
DCHECK_LT(i, max_size_);
|
assert(i < max_size_);
|
||||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
// Unsigned comparison avoids checking sparse_[i] < 0.
|
||||||
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
||||||
dense_[sparse_to_dense_[i]].index_ == i;
|
dense_[sparse_[i]].index_ == i;
|
||||||
}
|
|
||||||
|
|
||||||
// Set the value at index i to v.
|
|
||||||
template<typename Value>
|
|
||||||
typename SparseArray<Value>::iterator SparseArray<Value>::set(int i, Value v) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
|
||||||
// Semantically, end() would be better here, but we already know
|
|
||||||
// the user did something stupid, so begin() insulates them from
|
|
||||||
// dereferencing an invalid pointer.
|
|
||||||
return begin();
|
|
||||||
}
|
|
||||||
if (!has_index(i))
|
|
||||||
create_index(i);
|
|
||||||
return set_existing(i, v);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
pair<typename SparseArray<Value>::iterator, bool> SparseArray<Value>::insert(
|
const Value& SparseArray<Value>::get_existing(int i) const {
|
||||||
const value_type& new_value) {
|
assert(has_index(i));
|
||||||
DebugCheckInvariants();
|
return dense_[sparse_[i]].second;
|
||||||
pair<typename SparseArray<Value>::iterator, bool> p;
|
|
||||||
if (has_index(new_value.index_)) {
|
|
||||||
p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false);
|
|
||||||
} else {
|
|
||||||
p = make_pair(set_new(new_value.index_, new_value.second), true);
|
|
||||||
}
|
|
||||||
DebugCheckInvariants();
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
Value SparseArray<Value>::get(int i, Value defaultv) const {
|
|
||||||
if (!has_index(i))
|
|
||||||
return defaultv;
|
|
||||||
return get_existing(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
typename SparseArray<Value>::iterator SparseArray<Value>::find(int i) {
|
|
||||||
if (has_index(i))
|
|
||||||
return dense_.begin() + sparse_to_dense_[i];
|
|
||||||
return end();
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
typename SparseArray<Value>::const_iterator
|
|
||||||
SparseArray<Value>::find(int i) const {
|
|
||||||
if (has_index(i)) {
|
|
||||||
return dense_.begin() + sparse_to_dense_[i];
|
|
||||||
}
|
|
||||||
return end();
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
typename SparseArray<Value>::iterator
|
|
||||||
SparseArray<Value>::set_existing(int i, Value v) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
DCHECK(has_index(i));
|
|
||||||
dense_[sparse_to_dense_[i]].second = v;
|
|
||||||
DebugCheckInvariants();
|
|
||||||
return dense_.begin() + sparse_to_dense_[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
typename SparseArray<Value>::iterator
|
|
||||||
SparseArray<Value>::set_new(int i, Value v) {
|
|
||||||
DebugCheckInvariants();
|
|
||||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
|
||||||
// Semantically, end() would be better here, but we already know
|
|
||||||
// the user did something stupid, so begin() insulates them from
|
|
||||||
// dereferencing an invalid pointer.
|
|
||||||
return begin();
|
|
||||||
}
|
|
||||||
DCHECK(!has_index(i));
|
|
||||||
create_index(i);
|
|
||||||
return set_existing(i, v);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<typename Value>
|
|
||||||
Value SparseArray<Value>::get_existing(int i) const {
|
|
||||||
DCHECK(has_index(i));
|
|
||||||
return dense_[sparse_to_dense_[i]].second;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
@ -396,11 +477,11 @@ void SparseArray<Value>::erase(int i) {
|
|||||||
template<typename Value>
|
template<typename Value>
|
||||||
void SparseArray<Value>::erase_existing(int i) {
|
void SparseArray<Value>::erase_existing(int i) {
|
||||||
DebugCheckInvariants();
|
DebugCheckInvariants();
|
||||||
DCHECK(has_index(i));
|
assert(has_index(i));
|
||||||
int di = sparse_to_dense_[i];
|
int di = sparse_[i];
|
||||||
if (di < size_ - 1) {
|
if (di < size_ - 1) {
|
||||||
dense_[di] = dense_[size_ - 1];
|
dense_[di] = std::move(dense_[size_ - 1]);
|
||||||
sparse_to_dense_[dense_[di].index_] = di;
|
sparse_[dense_[di].index_] = di;
|
||||||
}
|
}
|
||||||
size_--;
|
size_--;
|
||||||
DebugCheckInvariants();
|
DebugCheckInvariants();
|
||||||
@ -408,38 +489,30 @@ void SparseArray<Value>::erase_existing(int i) {
|
|||||||
|
|
||||||
template<typename Value>
|
template<typename Value>
|
||||||
void SparseArray<Value>::create_index(int i) {
|
void SparseArray<Value>::create_index(int i) {
|
||||||
DCHECK(!has_index(i));
|
assert(!has_index(i));
|
||||||
DCHECK_LT(size_, max_size_);
|
assert(size_ < max_size_);
|
||||||
sparse_to_dense_[i] = size_;
|
sparse_[i] = size_;
|
||||||
dense_[size_].index_ = i;
|
dense_[size_].index_ = i;
|
||||||
size_++;
|
size_++;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
|
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
|
||||||
max_size_ = max_size;
|
sparse_.reset(new int[max_size]);
|
||||||
sparse_to_dense_ = new int[max_size];
|
dense_.reset(new IndexValue[max_size]);
|
||||||
valgrind_ = RunningOnValgrind();
|
|
||||||
dense_.resize(max_size);
|
|
||||||
// Don't need to zero the new memory, but appease Valgrind.
|
|
||||||
if (valgrind_) {
|
|
||||||
for (int i = 0; i < max_size; i++) {
|
|
||||||
sparse_to_dense_[i] = 0xababababU;
|
|
||||||
dense_[i].index_ = 0xababababU;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
size_ = 0;
|
size_ = 0;
|
||||||
|
MaybeInitializeMemory(size_, max_size);
|
||||||
|
max_size_ = max_size;
|
||||||
DebugCheckInvariants();
|
DebugCheckInvariants();
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Value> SparseArray<Value>::~SparseArray() {
|
template<typename Value> SparseArray<Value>::~SparseArray() {
|
||||||
DebugCheckInvariants();
|
DebugCheckInvariants();
|
||||||
delete[] sparse_to_dense_;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
|
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
|
||||||
DCHECK_LE(0, size_);
|
assert(0 <= size_);
|
||||||
DCHECK_LE(size_, max_size_);
|
assert(size_ <= max_size_);
|
||||||
DCHECK(size_ == 0 || sparse_to_dense_ != NULL);
|
assert(size_ == 0 || sparse_ != NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Comparison function for sorting.
|
// Comparison function for sorting.
|
||||||
@ -450,4 +523,4 @@ template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
|
|||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_UTIL_SPARSE_ARRAY_H__
|
#endif // UTIL_SPARSE_ARRAY_H_
|
||||||
|
@ -2,9 +2,12 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_SPARSE_SET_H_
|
||||||
|
#define UTIL_SPARSE_SET_H_
|
||||||
|
|
||||||
// DESCRIPTION
|
// DESCRIPTION
|
||||||
//
|
//
|
||||||
// SparseSet<T>(m) is a set of integers in [0, m).
|
// SparseSet(m) is a set of integers in [0, m).
|
||||||
// It requires sizeof(int)*m memory, but it provides
|
// It requires sizeof(int)*m memory, but it provides
|
||||||
// fast iteration through the elements in the set and fast clearing
|
// fast iteration through the elements in the set and fast clearing
|
||||||
// of the set.
|
// of the set.
|
||||||
@ -20,7 +23,7 @@
|
|||||||
// is the number of items in the set (not O(m)).
|
// is the number of items in the set (not O(m)).
|
||||||
//
|
//
|
||||||
// The set iterator visits entries in the order they were first
|
// The set iterator visits entries in the order they were first
|
||||||
// inserted into the array. It is safe to add items to the set while
|
// inserted into the set. It is safe to add items to the set while
|
||||||
// using an iterator: the iterator will visit indices added to the set
|
// using an iterator: the iterator will visit indices added to the set
|
||||||
// during the iteration, but will not re-visit indices whose values
|
// during the iteration, but will not re-visit indices whose values
|
||||||
// change after visiting. Thus SparseSet can be a convenient
|
// change after visiting. Thus SparseSet can be a convenient
|
||||||
@ -38,142 +41,226 @@
|
|||||||
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
// Representation for Sparse Sets, ACM Letters on Programming Languages
|
||||||
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
|
||||||
//
|
//
|
||||||
// For a generalization to sparse array, see sparse_array.h.
|
// This is a specialization of sparse array; see sparse_array.h.
|
||||||
|
|
||||||
// IMPLEMENTATION
|
// IMPLEMENTATION
|
||||||
//
|
//
|
||||||
// See sparse_array.h for implementation details
|
// See sparse_array.h for implementation details.
|
||||||
|
|
||||||
#ifndef RE2_UTIL_SPARSE_SET_H__
|
// Doing this simplifies the logic below.
|
||||||
#define RE2_UTIL_SPARSE_SET_H__
|
#ifndef __has_feature
|
||||||
|
#define __has_feature(x) 0
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "util/util.h"
|
#include <assert.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <string.h>
|
||||||
|
#if __has_feature(memory_sanitizer)
|
||||||
|
#include <sanitizer/msan_interface.h>
|
||||||
|
#endif
|
||||||
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
class SparseSet {
|
template<typename Value>
|
||||||
|
class SparseSetT {
|
||||||
public:
|
public:
|
||||||
SparseSet()
|
SparseSetT();
|
||||||
: size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {}
|
explicit SparseSetT(int max_size);
|
||||||
|
~SparseSetT();
|
||||||
SparseSet(int max_size) {
|
|
||||||
max_size_ = max_size;
|
|
||||||
sparse_to_dense_ = new int[max_size];
|
|
||||||
dense_ = new int[max_size];
|
|
||||||
valgrind_ = RunningOnValgrind();
|
|
||||||
// Don't need to zero the memory, but do so anyway
|
|
||||||
// to appease Valgrind.
|
|
||||||
if (valgrind_) {
|
|
||||||
for (int i = 0; i < max_size; i++) {
|
|
||||||
dense_[i] = 0xababababU;
|
|
||||||
sparse_to_dense_[i] = 0xababababU;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
size_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
~SparseSet() {
|
|
||||||
delete[] sparse_to_dense_;
|
|
||||||
delete[] dense_;
|
|
||||||
}
|
|
||||||
|
|
||||||
typedef int* iterator;
|
typedef int* iterator;
|
||||||
typedef const int* const_iterator;
|
typedef const int* const_iterator;
|
||||||
|
|
||||||
int size() const { return size_; }
|
// Return the number of entries in the set.
|
||||||
iterator begin() { return dense_; }
|
int size() const {
|
||||||
iterator end() { return dense_ + size_; }
|
return size_;
|
||||||
const_iterator begin() const { return dense_; }
|
}
|
||||||
const_iterator end() const { return dense_ + size_; }
|
|
||||||
|
|
||||||
// Change the maximum size of the array.
|
// Indicate whether the set is empty.
|
||||||
|
int empty() const {
|
||||||
|
return size_ == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate over the set.
|
||||||
|
iterator begin() {
|
||||||
|
return dense_.get();
|
||||||
|
}
|
||||||
|
iterator end() {
|
||||||
|
return dense_.get() + size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
const_iterator begin() const {
|
||||||
|
return dense_.get();
|
||||||
|
}
|
||||||
|
const_iterator end() const {
|
||||||
|
return dense_.get() + size_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Change the maximum size of the set.
|
||||||
// Invalidates all iterators.
|
// Invalidates all iterators.
|
||||||
void resize(int new_max_size) {
|
void resize(int max_size);
|
||||||
if (size_ > new_max_size)
|
|
||||||
size_ = new_max_size;
|
|
||||||
if (new_max_size > max_size_) {
|
|
||||||
int* a = new int[new_max_size];
|
|
||||||
if (sparse_to_dense_) {
|
|
||||||
memmove(a, sparse_to_dense_, max_size_*sizeof a[0]);
|
|
||||||
if (valgrind_) {
|
|
||||||
for (int i = max_size_; i < new_max_size; i++)
|
|
||||||
a[i] = 0xababababU;
|
|
||||||
}
|
|
||||||
delete[] sparse_to_dense_;
|
|
||||||
}
|
|
||||||
sparse_to_dense_ = a;
|
|
||||||
|
|
||||||
a = new int[new_max_size];
|
// Return the maximum size of the set.
|
||||||
if (dense_) {
|
|
||||||
memmove(a, dense_, size_*sizeof a[0]);
|
|
||||||
if (valgrind_) {
|
|
||||||
for (int i = size_; i < new_max_size; i++)
|
|
||||||
a[i] = 0xababababU;
|
|
||||||
}
|
|
||||||
delete[] dense_;
|
|
||||||
}
|
|
||||||
dense_ = a;
|
|
||||||
}
|
|
||||||
max_size_ = new_max_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Return the maximum size of the array.
|
|
||||||
// Indices can be in the range [0, max_size).
|
// Indices can be in the range [0, max_size).
|
||||||
int max_size() const { return max_size_; }
|
int max_size() const {
|
||||||
|
return max_size_;
|
||||||
// Clear the array.
|
|
||||||
void clear() { size_ = 0; }
|
|
||||||
|
|
||||||
// Check whether i is in the array.
|
|
||||||
bool contains(int i) const {
|
|
||||||
DCHECK_GE(i, 0);
|
|
||||||
DCHECK_LT(i, max_size_);
|
|
||||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
// Unsigned comparison avoids checking sparse_to_dense_[i] < 0.
|
|
||||||
return (uint)sparse_to_dense_[i] < (uint)size_ &&
|
|
||||||
dense_[sparse_to_dense_[i]] == i;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adds i to the set.
|
// Clear the set.
|
||||||
void insert(int i) {
|
void clear() {
|
||||||
if (!contains(i))
|
size_ = 0;
|
||||||
insert_new(i);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the value at the new index i to v.
|
// Check whether index i is in the set.
|
||||||
|
bool contains(int i) const;
|
||||||
|
|
||||||
|
// Comparison function for sorting.
|
||||||
|
// Can sort the sparse set so that future iterations
|
||||||
|
// will visit indices in increasing order using
|
||||||
|
// std::sort(arr.begin(), arr.end(), arr.less);
|
||||||
|
static bool less(int a, int b);
|
||||||
|
|
||||||
|
public:
|
||||||
|
// Insert index i into the set.
|
||||||
|
iterator insert(int i) {
|
||||||
|
return InsertInternal(true, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert index i into the set.
|
||||||
// Fast but unsafe: only use if contains(i) is false.
|
// Fast but unsafe: only use if contains(i) is false.
|
||||||
void insert_new(int i) {
|
iterator insert_new(int i) {
|
||||||
if (static_cast<uint>(i) >= static_cast<uint>(max_size_)) {
|
return InsertInternal(false, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
iterator InsertInternal(bool allow_existing, int i) {
|
||||||
|
DebugCheckInvariants();
|
||||||
|
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||||
|
assert(false && "illegal index");
|
||||||
// Semantically, end() would be better here, but we already know
|
// Semantically, end() would be better here, but we already know
|
||||||
// the user did something stupid, so begin() insulates them from
|
// the user did something stupid, so begin() insulates them from
|
||||||
// dereferencing an invalid pointer.
|
// dereferencing an invalid pointer.
|
||||||
return;
|
return begin();
|
||||||
}
|
}
|
||||||
DCHECK(!contains(i));
|
if (!allow_existing) {
|
||||||
DCHECK_LT(size_, max_size_);
|
assert(!contains(i));
|
||||||
sparse_to_dense_[i] = size_;
|
create_index(i);
|
||||||
|
} else {
|
||||||
|
if (!contains(i))
|
||||||
|
create_index(i);
|
||||||
|
}
|
||||||
|
DebugCheckInvariants();
|
||||||
|
return dense_.get() + sparse_[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the index i to the set.
|
||||||
|
// Only use if contains(i) is known to be false.
|
||||||
|
// This function is private, only intended as a helper
|
||||||
|
// for other methods.
|
||||||
|
void create_index(int i);
|
||||||
|
|
||||||
|
// In debug mode, verify that some invariant properties of the class
|
||||||
|
// are being maintained. This is called at the end of the constructor
|
||||||
|
// and at the beginning and end of all public non-const member functions.
|
||||||
|
void DebugCheckInvariants() const;
|
||||||
|
|
||||||
|
// Initializes memory for elements [min, max).
|
||||||
|
void MaybeInitializeMemory(int min, int max) {
|
||||||
|
#if __has_feature(memory_sanitizer)
|
||||||
|
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
|
||||||
|
#elif defined(RE2_ON_VALGRIND)
|
||||||
|
for (int i = min; i < max; i++) {
|
||||||
|
sparse_[i] = 0xababababU;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int size_ = 0;
|
||||||
|
int max_size_ = 0;
|
||||||
|
std::unique_ptr<int[]> sparse_;
|
||||||
|
std::unique_ptr<int[]> dense_;
|
||||||
|
};
|
||||||
|
|
||||||
|
template<typename Value>
|
||||||
|
SparseSetT<Value>::SparseSetT() = default;
|
||||||
|
|
||||||
|
// Change the maximum size of the set.
|
||||||
|
// Invalidates all iterators.
|
||||||
|
template<typename Value>
|
||||||
|
void SparseSetT<Value>::resize(int max_size) {
|
||||||
|
DebugCheckInvariants();
|
||||||
|
if (max_size > max_size_) {
|
||||||
|
std::unique_ptr<int[]> a(new int[max_size]);
|
||||||
|
if (sparse_) {
|
||||||
|
std::copy_n(sparse_.get(), max_size_, a.get());
|
||||||
|
}
|
||||||
|
sparse_ = std::move(a);
|
||||||
|
|
||||||
|
std::unique_ptr<int[]> b(new int[max_size]);
|
||||||
|
if (dense_) {
|
||||||
|
std::copy_n(dense_.get(), max_size_, b.get());
|
||||||
|
}
|
||||||
|
dense_ = std::move(b);
|
||||||
|
|
||||||
|
MaybeInitializeMemory(max_size_, max_size);
|
||||||
|
}
|
||||||
|
max_size_ = max_size;
|
||||||
|
if (size_ > max_size_)
|
||||||
|
size_ = max_size_;
|
||||||
|
DebugCheckInvariants();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check whether index i is in the set.
|
||||||
|
template<typename Value>
|
||||||
|
bool SparseSetT<Value>::contains(int i) const {
|
||||||
|
assert(i >= 0);
|
||||||
|
assert(i < max_size_);
|
||||||
|
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Unsigned comparison avoids checking sparse_[i] < 0.
|
||||||
|
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
|
||||||
|
dense_[sparse_[i]] == i;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Value>
|
||||||
|
void SparseSetT<Value>::create_index(int i) {
|
||||||
|
assert(!contains(i));
|
||||||
|
assert(size_ < max_size_);
|
||||||
|
sparse_[i] = size_;
|
||||||
dense_[size_] = i;
|
dense_[size_] = i;
|
||||||
size_++;
|
size_++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) {
|
||||||
|
sparse_.reset(new int[max_size]);
|
||||||
|
dense_.reset(new int[max_size]);
|
||||||
|
size_ = 0;
|
||||||
|
MaybeInitializeMemory(size_, max_size);
|
||||||
|
max_size_ = max_size;
|
||||||
|
DebugCheckInvariants();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Value> SparseSetT<Value>::~SparseSetT() {
|
||||||
|
DebugCheckInvariants();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const {
|
||||||
|
assert(0 <= size_);
|
||||||
|
assert(size_ <= max_size_);
|
||||||
|
assert(size_ == 0 || sparse_ != NULL);
|
||||||
|
}
|
||||||
|
|
||||||
// Comparison function for sorting.
|
// Comparison function for sorting.
|
||||||
// Can sort the sparse array so that future iterations
|
template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
|
||||||
// will visit indices in increasing order using
|
return a < b;
|
||||||
// sort(arr.begin(), arr.end(), arr.less);
|
}
|
||||||
static bool less(int a, int b) { return a < b; }
|
|
||||||
|
|
||||||
private:
|
typedef SparseSetT<void> SparseSet;
|
||||||
int size_;
|
|
||||||
int max_size_;
|
|
||||||
int* sparse_to_dense_;
|
|
||||||
int* dense_;
|
|
||||||
bool valgrind_;
|
|
||||||
|
|
||||||
DISALLOW_EVIL_CONSTRUCTORS(SparseSet);
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_UTIL_SPARSE_SET_H__
|
#endif // UTIL_SPARSE_SET_H_
|
||||||
|
@ -1,87 +0,0 @@
|
|||||||
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "re2/stringpiece.h"
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
using re2::StringPiece;
|
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
|
|
||||||
o.write(piece.data(), piece.size());
|
|
||||||
return o;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
|
|
||||||
int len = x.size();
|
|
||||||
if (len != y.size()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const char* p = x.data();
|
|
||||||
const char* p2 = y.data();
|
|
||||||
// Test last byte in case strings share large common prefix
|
|
||||||
if ((len > 0) && (p[len-1] != p2[len-1])) return false;
|
|
||||||
const char* p_limit = p + len;
|
|
||||||
for (; p < p_limit; p++, p2++) {
|
|
||||||
if (*p != *p2)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void StringPiece::CopyToString(string* target) const {
|
|
||||||
target->assign(ptr_, length_);
|
|
||||||
}
|
|
||||||
|
|
||||||
int StringPiece::copy(char* buf, size_type n, size_type pos) const {
|
|
||||||
int ret = min(length_ - pos, n);
|
|
||||||
memcpy(buf, ptr_ + pos, ret);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
int StringPiece::find(const StringPiece& s, size_type pos) const {
|
|
||||||
if (length_ < 0 || pos > static_cast<size_type>(length_))
|
|
||||||
return npos;
|
|
||||||
|
|
||||||
const char* result = std::search(ptr_ + pos, ptr_ + length_,
|
|
||||||
s.ptr_, s.ptr_ + s.length_);
|
|
||||||
const size_type xpos = result - ptr_;
|
|
||||||
return xpos + s.length_ <= static_cast<size_type>(length_) ? xpos : npos;
|
|
||||||
}
|
|
||||||
|
|
||||||
int StringPiece::find(char c, size_type pos) const {
|
|
||||||
if (length_ <= 0 || pos >= static_cast<size_type>(length_)) {
|
|
||||||
return npos;
|
|
||||||
}
|
|
||||||
const char* result = std::find(ptr_ + pos, ptr_ + length_, c);
|
|
||||||
return result != ptr_ + length_ ? result - ptr_ : npos;
|
|
||||||
}
|
|
||||||
|
|
||||||
int StringPiece::rfind(const StringPiece& s, size_type pos) const {
|
|
||||||
if (length_ < s.length_) return npos;
|
|
||||||
const size_t ulen = length_;
|
|
||||||
if (s.length_ == 0) return min(ulen, pos);
|
|
||||||
|
|
||||||
const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_;
|
|
||||||
const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_);
|
|
||||||
return result != last ? result - ptr_ : npos;
|
|
||||||
}
|
|
||||||
|
|
||||||
int StringPiece::rfind(char c, size_type pos) const {
|
|
||||||
if (length_ <= 0) return npos;
|
|
||||||
for (int i = min(pos, static_cast<size_type>(length_ - 1));
|
|
||||||
i >= 0; --i) {
|
|
||||||
if (ptr_[i] == c) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return npos;
|
|
||||||
}
|
|
||||||
|
|
||||||
StringPiece StringPiece::substr(size_type pos, size_type n) const {
|
|
||||||
if (pos > static_cast<size_type>(length_)) pos = length_;
|
|
||||||
if (n > length_ - pos) n = length_ - pos;
|
|
||||||
return StringPiece(ptr_ + pos, n);
|
|
||||||
}
|
|
||||||
|
|
||||||
const StringPiece::size_type StringPiece::npos = size_type(-1);
|
|
@ -1,78 +0,0 @@
|
|||||||
// Copyright 2002 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
static void StringAppendV(string* dst, const char* format, va_list ap) {
|
|
||||||
// First try with a small fixed size buffer
|
|
||||||
char space[1024];
|
|
||||||
|
|
||||||
// It's possible for methods that use a va_list to invalidate
|
|
||||||
// the data in it upon use. The fix is to make a copy
|
|
||||||
// of the structure before using it and use that copy instead.
|
|
||||||
va_list backup_ap;
|
|
||||||
va_copy(backup_ap, ap);
|
|
||||||
int result = vsnprintf(space, sizeof(space), format, backup_ap);
|
|
||||||
va_end(backup_ap);
|
|
||||||
|
|
||||||
if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
|
|
||||||
// It fit
|
|
||||||
dst->append(space, result);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Repeatedly increase buffer size until it fits
|
|
||||||
int length = sizeof(space);
|
|
||||||
while (true) {
|
|
||||||
if (result < 0) {
|
|
||||||
// Older behavior: just try doubling the buffer size
|
|
||||||
length *= 2;
|
|
||||||
} else {
|
|
||||||
// We need exactly "result+1" characters
|
|
||||||
length = result+1;
|
|
||||||
}
|
|
||||||
char* buf = new char[length];
|
|
||||||
|
|
||||||
// Restore the va_list before we use it again
|
|
||||||
va_copy(backup_ap, ap);
|
|
||||||
result = vsnprintf(buf, length, format, backup_ap);
|
|
||||||
va_end(backup_ap);
|
|
||||||
|
|
||||||
if ((result >= 0) && (result < length)) {
|
|
||||||
// It fit
|
|
||||||
dst->append(buf, result);
|
|
||||||
delete[] buf;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
delete[] buf;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
string StringPrintf(const char* format, ...) {
|
|
||||||
va_list ap;
|
|
||||||
va_start(ap, format);
|
|
||||||
string result;
|
|
||||||
StringAppendV(&result, format, ap);
|
|
||||||
va_end(ap);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
void SStringPrintf(string* dst, const char* format, ...) {
|
|
||||||
va_list ap;
|
|
||||||
va_start(ap, format);
|
|
||||||
dst->clear();
|
|
||||||
StringAppendV(dst, format, ap);
|
|
||||||
va_end(ap);
|
|
||||||
}
|
|
||||||
|
|
||||||
void StringAppendF(string* dst, const char* format, ...) {
|
|
||||||
va_list ap;
|
|
||||||
va_start(ap, format);
|
|
||||||
StringAppendV(dst, format, ap);
|
|
||||||
va_end(ap);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
@ -2,8 +2,15 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#include "util/util.h"
|
#include <stdarg.h>
|
||||||
#include "re2/stringpiece.h"
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "util/strutil.h"
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
#define snprintf _snprintf
|
||||||
|
#define vsnprintf _vsnprintf
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace re2 {
|
namespace re2 {
|
||||||
|
|
||||||
@ -12,16 +19,16 @@ namespace re2 {
|
|||||||
// Copies 'src' to 'dest', escaping dangerous characters using
|
// Copies 'src' to 'dest', escaping dangerous characters using
|
||||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
||||||
// Returns the number of bytes written to 'dest' (not including the \0)
|
// Returns the number of bytes written to 'dest' (not including the \0)
|
||||||
// or -1 if there was insufficient space.
|
// or (size_t)-1 if there was insufficient space.
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
int CEscapeString(const char* src, int src_len, char* dest,
|
static size_t CEscapeString(const char* src, size_t src_len,
|
||||||
int dest_len) {
|
char* dest, size_t dest_len) {
|
||||||
const char* src_end = src + src_len;
|
const char* src_end = src + src_len;
|
||||||
int used = 0;
|
size_t used = 0;
|
||||||
|
|
||||||
for (; src < src_end; src++) {
|
for (; src < src_end; src++) {
|
||||||
if (dest_len - used < 2) // Need space for two letter escape
|
if (dest_len - used < 2) // space for two-character escape
|
||||||
return -1;
|
return (size_t)-1;
|
||||||
|
|
||||||
unsigned char c = *src;
|
unsigned char c = *src;
|
||||||
switch (c) {
|
switch (c) {
|
||||||
@ -36,9 +43,9 @@ int CEscapeString(const char* src, int src_len, char* dest,
|
|||||||
// digit then that digit must be escaped too to prevent it being
|
// digit then that digit must be escaped too to prevent it being
|
||||||
// interpreted as part of the character code by C.
|
// interpreted as part of the character code by C.
|
||||||
if (c < ' ' || c > '~') {
|
if (c < ' ' || c > '~') {
|
||||||
if (dest_len - used < 4) // need space for 4 letter escape
|
if (dest_len - used < 5) // space for four-character escape + \0
|
||||||
return -1;
|
return (size_t)-1;
|
||||||
sprintf(dest + used, "\\%03o", c);
|
snprintf(dest + used, 5, "\\%03o", c);
|
||||||
used += 4;
|
used += 4;
|
||||||
} else {
|
} else {
|
||||||
dest[used++] = c; break;
|
dest[used++] = c; break;
|
||||||
@ -47,51 +54,111 @@ int CEscapeString(const char* src, int src_len, char* dest,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (dest_len - used < 1) // make sure that there is room for \0
|
if (dest_len - used < 1) // make sure that there is room for \0
|
||||||
return -1;
|
return (size_t)-1;
|
||||||
|
|
||||||
dest[used] = '\0'; // doesn't count towards return value though
|
dest[used] = '\0'; // doesn't count towards return value though
|
||||||
return used;
|
return used;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
// CEscape()
|
// CEscape()
|
||||||
// Copies 'src' to result, escaping dangerous characters using
|
// Copies 'src' to result, escaping dangerous characters using
|
||||||
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
// C-style escape sequences. 'src' and 'dest' should not overlap.
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
string CEscape(const StringPiece& src) {
|
string CEscape(const StringPiece& src) {
|
||||||
const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
|
const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion
|
||||||
char* dest = new char[dest_length];
|
char* dest = new char[dest_len];
|
||||||
const int len = CEscapeString(src.data(), src.size(),
|
const size_t used = CEscapeString(src.data(), src.size(),
|
||||||
dest, dest_length);
|
dest, dest_len);
|
||||||
string s = string(dest, len);
|
string s = string(dest, used);
|
||||||
delete[] dest;
|
delete[] dest;
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
string PrefixSuccessor(const StringPiece& prefix) {
|
void PrefixSuccessor(string* prefix) {
|
||||||
// We can increment the last character in the string and be done
|
// We can increment the last character in the string and be done
|
||||||
// unless that character is 255, in which case we have to erase the
|
// unless that character is 255, in which case we have to erase the
|
||||||
// last character and increment the previous character, unless that
|
// last character and increment the previous character, unless that
|
||||||
// is 255, etc. If the string is empty or consists entirely of
|
// is 255, etc. If the string is empty or consists entirely of
|
||||||
// 255's, we just return the empty string.
|
// 255's, we just return the empty string.
|
||||||
bool done = false;
|
while (!prefix->empty()) {
|
||||||
string limit(prefix.data(), prefix.size());
|
char& c = prefix->back();
|
||||||
int index = limit.length() - 1;
|
if (c == '\xff') { // char literal avoids signed/unsigned.
|
||||||
while (!done && index >= 0) {
|
prefix->pop_back();
|
||||||
if ((limit[index]&255) == 255) {
|
|
||||||
limit.erase(index);
|
|
||||||
index--;
|
|
||||||
} else {
|
} else {
|
||||||
limit[index]++;
|
++c;
|
||||||
done = true;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!done) {
|
}
|
||||||
return "";
|
|
||||||
} else {
|
static void StringAppendV(string* dst, const char* format, va_list ap) {
|
||||||
return limit;
|
// First try with a small fixed size buffer
|
||||||
}
|
char space[1024];
|
||||||
|
|
||||||
|
// It's possible for methods that use a va_list to invalidate
|
||||||
|
// the data in it upon use. The fix is to make a copy
|
||||||
|
// of the structure before using it and use that copy instead.
|
||||||
|
va_list backup_ap;
|
||||||
|
va_copy(backup_ap, ap);
|
||||||
|
int result = vsnprintf(space, sizeof(space), format, backup_ap);
|
||||||
|
va_end(backup_ap);
|
||||||
|
|
||||||
|
if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
|
||||||
|
// It fit
|
||||||
|
dst->append(space, result);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Repeatedly increase buffer size until it fits
|
||||||
|
int length = sizeof(space);
|
||||||
|
while (true) {
|
||||||
|
if (result < 0) {
|
||||||
|
// Older behavior: just try doubling the buffer size
|
||||||
|
length *= 2;
|
||||||
|
} else {
|
||||||
|
// We need exactly "result+1" characters
|
||||||
|
length = result+1;
|
||||||
|
}
|
||||||
|
char* buf = new char[length];
|
||||||
|
|
||||||
|
// Restore the va_list before we use it again
|
||||||
|
va_copy(backup_ap, ap);
|
||||||
|
result = vsnprintf(buf, length, format, backup_ap);
|
||||||
|
va_end(backup_ap);
|
||||||
|
|
||||||
|
if ((result >= 0) && (result < length)) {
|
||||||
|
// It fit
|
||||||
|
dst->append(buf, result);
|
||||||
|
delete[] buf;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
delete[] buf;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
string StringPrintf(const char* format, ...) {
|
||||||
|
va_list ap;
|
||||||
|
va_start(ap, format);
|
||||||
|
string result;
|
||||||
|
StringAppendV(&result, format, ap);
|
||||||
|
va_end(ap);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SStringPrintf(string* dst, const char* format, ...) {
|
||||||
|
va_list ap;
|
||||||
|
va_start(ap, format);
|
||||||
|
dst->clear();
|
||||||
|
StringAppendV(dst, format, ap);
|
||||||
|
va_end(ap);
|
||||||
|
}
|
||||||
|
|
||||||
|
void StringAppendF(string* dst, const char* format, ...) {
|
||||||
|
va_list ap;
|
||||||
|
va_start(ap, format);
|
||||||
|
StringAppendV(dst, format, ap);
|
||||||
|
va_end(ap);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
23
contrib/libre2/util/strutil.h
Normal file
23
contrib/libre2/util/strutil.h
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
// Copyright 2016 The RE2 Authors. All Rights Reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style
|
||||||
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
|
#ifndef UTIL_STRUTIL_H_
|
||||||
|
#define UTIL_STRUTIL_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "re2/stringpiece.h"
|
||||||
|
#include "util/util.h"
|
||||||
|
|
||||||
|
namespace re2 {
|
||||||
|
|
||||||
|
string CEscape(const StringPiece& src);
|
||||||
|
void PrefixSuccessor(string* prefix);
|
||||||
|
string StringPrintf(const char* format, ...);
|
||||||
|
void SStringPrintf(string* dst, const char* format, ...);
|
||||||
|
void StringAppendF(string* dst, const char* format, ...);
|
||||||
|
|
||||||
|
} // namespace re2
|
||||||
|
|
||||||
|
#endif // UTIL_STRUTIL_H_
|
@ -3,7 +3,10 @@
|
|||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#ifndef _WIN32
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "util/test.h"
|
#include "util/test.h"
|
||||||
|
|
||||||
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
|
||||||
@ -21,14 +24,6 @@ void RegisterTest(void (*fn)(void), const char *name) {
|
|||||||
tests[ntests++].name = name;
|
tests[ntests++].name = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
int64 VirtualProcessSize() {
|
|
||||||
struct rusage ru;
|
|
||||||
getrusage(RUSAGE_SELF, &ru);
|
|
||||||
return (int64)ru.ru_maxrss*1024;
|
|
||||||
}
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
for (int i = 0; i < ntests; i++) {
|
for (int i = 0; i < ntests; i++) {
|
||||||
printf("%s\n", tests[i].name);
|
printf("%s\n", tests[i].name);
|
||||||
|
@ -2,11 +2,12 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#ifndef RE2_UTIL_TEST_H__
|
#ifndef UTIL_TEST_H_
|
||||||
#define RE2_UTIL_TEST_H__
|
#define UTIL_TEST_H_
|
||||||
|
|
||||||
#include "util/util.h"
|
#include "util/util.h"
|
||||||
#include "util/flags.h"
|
#include "util/flags.h"
|
||||||
|
#include "util/logging.h"
|
||||||
|
|
||||||
#define TEST(x, y) \
|
#define TEST(x, y) \
|
||||||
void x##y(void); \
|
void x##y(void); \
|
||||||
@ -31,14 +32,6 @@ class TestRegisterer {
|
|||||||
#define EXPECT_GE CHECK_GE
|
#define EXPECT_GE CHECK_GE
|
||||||
#define EXPECT_FALSE(x) CHECK(!(x))
|
#define EXPECT_FALSE(x) CHECK(!(x))
|
||||||
|
|
||||||
#define ARRAYSIZE arraysize
|
|
||||||
|
|
||||||
#define EXPECT_TRUE_M(x, y) CHECK(x) << (y)
|
|
||||||
#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y)
|
|
||||||
#define ASSERT_TRUE_M(x, y) CHECK(x) << (y)
|
|
||||||
#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y)
|
|
||||||
|
|
||||||
const bool UsingMallocCounter = false;
|
|
||||||
namespace testing {
|
namespace testing {
|
||||||
class MallocCounter {
|
class MallocCounter {
|
||||||
public:
|
public:
|
||||||
@ -50,8 +43,4 @@ class MallocCounter {
|
|||||||
};
|
};
|
||||||
} // namespace testing
|
} // namespace testing
|
||||||
|
|
||||||
namespace re2 {
|
#endif // UTIL_TEST_H_
|
||||||
int64 VirtualProcessSize();
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#endif // RE2_UTIL_TEST_H__
|
|
||||||
|
@ -1,44 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include <pthread.h>
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/thread.h"
|
|
||||||
|
|
||||||
Thread::Thread() {
|
|
||||||
pid_ = 0;
|
|
||||||
running_ = 0;
|
|
||||||
joinable_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
Thread::~Thread() {
|
|
||||||
}
|
|
||||||
|
|
||||||
void *startThread(void *v) {
|
|
||||||
Thread* t = (Thread*)v;
|
|
||||||
t->Run();
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Thread::Start() {
|
|
||||||
CHECK(!running_);
|
|
||||||
pthread_create(&pid_, 0, startThread, this);
|
|
||||||
running_ = true;
|
|
||||||
if (!joinable_)
|
|
||||||
pthread_detach(pid_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Thread::Join() {
|
|
||||||
CHECK(running_);
|
|
||||||
CHECK(joinable_);
|
|
||||||
void *val;
|
|
||||||
pthread_join(pid_, &val);
|
|
||||||
running_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void Thread::SetJoinable(bool j) {
|
|
||||||
CHECK(!running_);
|
|
||||||
joinable_ = j;
|
|
||||||
}
|
|
@ -1,26 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#ifndef RE2_UTIL_THREAD_H__
|
|
||||||
#define RE2_UTIL_THREAD_H__
|
|
||||||
|
|
||||||
#include <pthread.h>
|
|
||||||
|
|
||||||
class Thread {
|
|
||||||
public:
|
|
||||||
Thread();
|
|
||||||
virtual ~Thread();
|
|
||||||
void Start();
|
|
||||||
void Join();
|
|
||||||
void SetJoinable(bool);
|
|
||||||
virtual void Run() = 0;
|
|
||||||
|
|
||||||
private:
|
|
||||||
pthread_t pid_;
|
|
||||||
bool running_;
|
|
||||||
bool joinable_;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // RE2_UTIL_THREAD_H__
|
|
||||||
|
|
@ -14,8 +14,9 @@
|
|||||||
* This file and rune.cc have been converted to compile as C++ code
|
* This file and rune.cc have been converted to compile as C++ code
|
||||||
* in name space re2.
|
* in name space re2.
|
||||||
*/
|
*/
|
||||||
#ifndef RE2_UTIL_UTF_H__
|
|
||||||
#define RE2_UTIL_UTF_H__
|
#ifndef UTIL_UTF_H_
|
||||||
|
#define UTIL_UTF_H_
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
@ -40,4 +41,4 @@ char* utfrune(const char*, Rune);
|
|||||||
|
|
||||||
} // namespace re2
|
} // namespace re2
|
||||||
|
|
||||||
#endif // RE2_UTIL_UTF_H__
|
#endif // UTIL_UTF_H_
|
||||||
|
@ -2,125 +2,21 @@
|
|||||||
// Use of this source code is governed by a BSD-style
|
// Use of this source code is governed by a BSD-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
|
||||||
#ifndef RE2_UTIL_UTIL_H__
|
#ifndef UTIL_UTIL_H_
|
||||||
#define RE2_UTIL_UTIL_H__
|
#define UTIL_UTIL_H_
|
||||||
|
|
||||||
// C
|
// TODO(junyer): Get rid of this.
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stddef.h> // For size_t
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdarg.h>
|
|
||||||
#include <sys/time.h>
|
|
||||||
#include <time.h>
|
|
||||||
#include <ctype.h> // For isdigit, isalpha.
|
|
||||||
|
|
||||||
// C++
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <algorithm>
|
|
||||||
#include <iosfwd>
|
|
||||||
#include <map>
|
|
||||||
#include <stack>
|
|
||||||
#include <ostream>
|
|
||||||
#include <utility>
|
|
||||||
#include <set>
|
|
||||||
|
|
||||||
// Use std names.
|
|
||||||
using std::set;
|
|
||||||
using std::pair;
|
|
||||||
using std::vector;
|
|
||||||
using std::string;
|
using std::string;
|
||||||
using std::min;
|
|
||||||
using std::max;
|
|
||||||
using std::ostream;
|
|
||||||
using std::map;
|
|
||||||
using std::stack;
|
|
||||||
using std::sort;
|
|
||||||
using std::swap;
|
|
||||||
using std::make_pair;
|
|
||||||
|
|
||||||
#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) && !defined(OS_ANDROID)
|
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))
|
||||||
|
|
||||||
#include <tr1/unordered_set>
|
#ifndef FALLTHROUGH_INTENDED
|
||||||
using std::tr1::unordered_set;
|
#define FALLTHROUGH_INTENDED do { } while (0)
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
#include <unordered_set>
|
|
||||||
#if defined(WIN32) || defined(OS_ANDROID)
|
|
||||||
using std::tr1::unordered_set;
|
|
||||||
#else
|
|
||||||
using std::unordered_set;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef NO_THREAD_SAFETY_ANALYSIS
|
||||||
|
#define NO_THREAD_SAFETY_ANALYSIS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace re2 {
|
#endif // UTIL_UTIL_H_
|
||||||
|
|
||||||
typedef int8_t int8;
|
|
||||||
typedef uint8_t uint8;
|
|
||||||
typedef int16_t int16;
|
|
||||||
typedef uint16_t uint16;
|
|
||||||
typedef int32_t int32;
|
|
||||||
typedef uint32_t uint32;
|
|
||||||
typedef int64_t int64;
|
|
||||||
typedef uint64_t uint64;
|
|
||||||
|
|
||||||
typedef unsigned long ulong;
|
|
||||||
typedef unsigned int uint;
|
|
||||||
typedef unsigned short ushort;
|
|
||||||
|
|
||||||
// COMPILE_ASSERT causes a compile error about msg if expr is not true.
|
|
||||||
#if __cplusplus >= 201103L
|
|
||||||
#define COMPILE_ASSERT(expr, msg) static_assert(expr, #msg)
|
|
||||||
#else
|
|
||||||
template<bool> struct CompileAssert {};
|
|
||||||
#define COMPILE_ASSERT(expr, msg) \
|
|
||||||
typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1]
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions.
|
|
||||||
// It goes in the private: declarations in a class.
|
|
||||||
#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \
|
|
||||||
TypeName(const TypeName&); \
|
|
||||||
void operator=(const TypeName&)
|
|
||||||
|
|
||||||
#define arraysize(array) (sizeof(array)/sizeof((array)[0]))
|
|
||||||
|
|
||||||
class StringPiece;
|
|
||||||
|
|
||||||
string CEscape(const StringPiece& src);
|
|
||||||
int CEscapeString(const char* src, int src_len, char* dest, int dest_len);
|
|
||||||
|
|
||||||
extern string StringPrintf(const char* format, ...);
|
|
||||||
extern void SStringPrintf(string* dst, const char* format, ...);
|
|
||||||
extern void StringAppendF(string* dst, const char* format, ...);
|
|
||||||
extern string PrefixSuccessor(const StringPiece& prefix);
|
|
||||||
|
|
||||||
uint32 hashword(const uint32*, size_t, uint32);
|
|
||||||
void hashword2(const uint32*, size_t, uint32*, uint32*);
|
|
||||||
|
|
||||||
static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) {
|
|
||||||
return hashword((uint32*)s, len/4, seed);
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) {
|
|
||||||
uint32 x, y;
|
|
||||||
x = seed;
|
|
||||||
y = 0;
|
|
||||||
hashword2((uint32*)s, len/4, &x, &y);
|
|
||||||
return ((uint64)x << 32) | y;
|
|
||||||
}
|
|
||||||
|
|
||||||
int RunningOnValgrind();
|
|
||||||
|
|
||||||
} // namespace re2
|
|
||||||
|
|
||||||
#include "util/arena.h"
|
|
||||||
#include "util/logging.h"
|
|
||||||
#include "util/mutex.h"
|
|
||||||
#include "util/utf.h"
|
|
||||||
|
|
||||||
#endif // RE2_UTIL_UTIL_H__
|
|
||||||
|
@ -1,24 +0,0 @@
|
|||||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
|
||||||
// Use of this source code is governed by a BSD-style
|
|
||||||
// license that can be found in the LICENSE file.
|
|
||||||
|
|
||||||
#include "util/util.h"
|
|
||||||
#include "util/valgrind.h"
|
|
||||||
|
|
||||||
namespace re2 {
|
|
||||||
|
|
||||||
#ifndef __has_feature
|
|
||||||
#define __has_feature(x) 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
int RunningOnValgrind() {
|
|
||||||
#if __has_feature(memory_sanitizer)
|
|
||||||
return true;
|
|
||||||
#elif defined(RUNNING_ON_VALGRIND)
|
|
||||||
return RUNNING_ON_VALGRIND;
|
|
||||||
#else
|
|
||||||
return 0;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace re2
|
|
File diff suppressed because it is too large
Load Diff
@ -623,7 +623,7 @@ struct ReplaceRegexpImpl
|
|||||||
{
|
{
|
||||||
re2_st::StringPiece matches[max_captures];
|
re2_st::StringPiece matches[max_captures];
|
||||||
|
|
||||||
int start_pos = 0;
|
size_t start_pos = 0;
|
||||||
while (start_pos < input.length())
|
while (start_pos < input.length())
|
||||||
{
|
{
|
||||||
/// If no more replacements possible for current string
|
/// If no more replacements possible for current string
|
||||||
|
Loading…
Reference in New Issue
Block a user