From 3e3d7b354a0f757d23fb6c66fd0c71887c81dd98 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2018 04:18:47 +0300 Subject: [PATCH] Updated re2 to the latest version [#CLICKHOUSE-2] --- contrib/libre2/CMakeLists.txt | 55 +- contrib/libre2/README | 10 +- contrib/libre2/create_st_headers.sh | 10 - contrib/libre2/re2/bitmap256.h | 113 + contrib/libre2/re2/bitstate.cc | 119 +- contrib/libre2/re2/compile.cc | 414 +- contrib/libre2/re2/dfa.cc | 861 ++-- contrib/libre2/re2/filtered_re2.cc | 51 +- contrib/libre2/re2/filtered_re2.h | 39 +- contrib/libre2/re2/mimics_pcre.cc | 4 +- contrib/libre2/re2/nfa.cc | 454 +- contrib/libre2/re2/onepass.cc | 285 +- contrib/libre2/re2/parse.cc | 685 ++- contrib/libre2/re2/prefilter.cc | 79 +- contrib/libre2/re2/prefilter.h | 25 +- contrib/libre2/re2/prefilter_tree.cc | 256 +- contrib/libre2/re2/prefilter_tree.h | 58 +- contrib/libre2/re2/prog.cc | 619 ++- contrib/libre2/re2/prog.h | 258 +- contrib/libre2/re2/re2.cc | 680 +-- contrib/libre2/re2/re2.h | 366 +- contrib/libre2/re2/regexp.cc | 167 +- contrib/libre2/re2/regexp.h | 203 +- contrib/libre2/re2/set.cc | 111 +- contrib/libre2/re2/set.h | 67 +- contrib/libre2/re2/simplify.cc | 341 +- contrib/libre2/re2/stringpiece.cc | 65 + contrib/libre2/re2/stringpiece.h | 206 +- contrib/libre2/re2/tostring.cc | 28 +- contrib/libre2/re2/unicode.py | 2 +- contrib/libre2/re2/unicode_casefold.cc | 104 +- contrib/libre2/re2/unicode_casefold.h | 17 +- contrib/libre2/re2/unicode_groups.cc | 1399 ++++-- contrib/libre2/re2/unicode_groups.h | 17 +- contrib/libre2/re2/walker-inl.h | 18 +- contrib/libre2/re2_transform.cmake | 1 + contrib/libre2/util/arena.cc | 168 - contrib/libre2/util/arena.h | 103 - contrib/libre2/util/atomicops.h | 137 - contrib/libre2/util/benchmark.cc | 39 +- contrib/libre2/util/benchmark.h | 12 +- contrib/libre2/util/flags.h | 14 +- contrib/libre2/util/fuzz.cc | 21 + contrib/libre2/util/hash.cc | 231 - contrib/libre2/util/logging.h | 59 +- contrib/libre2/util/mix.h | 41 
+ contrib/libre2/util/mutex.h | 172 +- contrib/libre2/util/pcre.cc | 358 +- contrib/libre2/util/pcre.h | 111 +- contrib/libre2/util/random.cc | 34 - contrib/libre2/util/random.h | 29 - contrib/libre2/util/rune.cc | 12 +- contrib/libre2/util/sparse_array.h | 469 +- contrib/libre2/util/sparse_set.h | 323 +- contrib/libre2/util/stringpiece.cc | 87 - contrib/libre2/util/stringprintf.cc | 78 - contrib/libre2/util/strutil.cc | 135 +- contrib/libre2/util/strutil.h | 23 + contrib/libre2/util/test.cc | 13 +- contrib/libre2/util/test.h | 23 +- contrib/libre2/util/thread.cc | 44 - contrib/libre2/util/thread.h | 26 - contrib/libre2/util/utf.h | 7 +- contrib/libre2/util/util.h | 122 +- contrib/libre2/util/valgrind.cc | 24 - contrib/libre2/util/valgrind.h | 4517 ------------------ dbms/src/Functions/FunctionsStringSearch.cpp | 2 +- 67 files changed, 6520 insertions(+), 9101 deletions(-) delete mode 100755 contrib/libre2/create_st_headers.sh create mode 100644 contrib/libre2/re2/bitmap256.h create mode 100644 contrib/libre2/re2/stringpiece.cc delete mode 100644 contrib/libre2/util/arena.cc delete mode 100644 contrib/libre2/util/arena.h delete mode 100644 contrib/libre2/util/atomicops.h create mode 100644 contrib/libre2/util/fuzz.cc delete mode 100644 contrib/libre2/util/hash.cc create mode 100644 contrib/libre2/util/mix.h delete mode 100644 contrib/libre2/util/random.cc delete mode 100644 contrib/libre2/util/random.h delete mode 100644 contrib/libre2/util/stringpiece.cc delete mode 100644 contrib/libre2/util/stringprintf.cc create mode 100644 contrib/libre2/util/strutil.h delete mode 100644 contrib/libre2/util/thread.cc delete mode 100644 contrib/libre2/util/thread.h delete mode 100644 contrib/libre2/util/valgrind.cc delete mode 100644 contrib/libre2/util/valgrind.h diff --git a/contrib/libre2/CMakeLists.txt b/contrib/libre2/CMakeLists.txt index e6ec2cbe3af..f64c084112c 100644 --- a/contrib/libre2/CMakeLists.txt +++ b/contrib/libre2/CMakeLists.txt @@ -1,33 +1,30 @@ set 
(re2_sources - ./re2/tostring.cc - ./re2/dfa.cc - ./re2/prefilter.cc - ./re2/compile.cc - ./re2/regexp.cc - ./re2/onepass.cc - ./re2/prefilter_tree.cc - ./re2/set.cc - ./re2/filtered_re2.cc - ./re2/perl_groups.cc - ./re2/parse.cc - ./re2/nfa.cc - ./re2/bitstate.cc - ./re2/simplify.cc - ./re2/unicode_groups.cc - ./re2/mimics_pcre.cc - ./re2/re2.cc - ./re2/prog.cc - ./re2/unicode_casefold.cc - ./util/strutil.cc - ./util/stringpiece.cc - ./util/hash.cc - ./util/arena.cc - ./util/valgrind.cc - ./util/pcre.cc - ./util/stringprintf.cc - ./util/rune.cc - ./util/random.cc - ./util/thread.cc +./re2/bitstate.cc +./re2/compile.cc +./re2/dfa.cc +./re2/filtered_re2.cc +./re2/mimics_pcre.cc +./re2/nfa.cc +./re2/onepass.cc +./re2/parse.cc +./re2/perl_groups.cc +./re2/prefilter.cc +./re2/prefilter_tree.cc +./re2/prog.cc +./re2/re2.cc +./re2/regexp.cc +./re2/set.cc +./re2/simplify.cc +./re2/stringpiece.cc +./re2/tostring.cc +./re2/unicode_casefold.cc +./re2/unicode_groups.cc +./util/benchmark.cc +./util/fuzz.cc +./util/pcre.cc +./util/rune.cc +./util/strutil.cc +./util/test.cc ) # Building re2 which is thread-safe and re2_st which is not. 
diff --git a/contrib/libre2/README b/contrib/libre2/README index 128e35b0f9d..9226176cff7 100644 --- a/contrib/libre2/README +++ b/contrib/libre2/README @@ -1,9 +1 @@ -Source: hg clone https://re2.googlecode.com/hg re2 - -Latest commit: - -changeset: 118:1b483548272e -tag: tip -user: Russ Cox -date: Mon Oct 06 15:08:47 2014 -0400 -summary: doc: import clarifications from Go tree \ No newline at end of file +https://github.com/google/re2/tree/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0 diff --git a/contrib/libre2/create_st_headers.sh b/contrib/libre2/create_st_headers.sh deleted file mode 100755 index ccc1d3ca804..00000000000 --- a/contrib/libre2/create_st_headers.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -rm -rf re2_st -mkdir -p re2_st - -for i in filtered_re2.h re2.h set.h stringpiece.h variadic_function.h; -do - cp $1/re2/$i re2_st/$i - sed -i -r 's/using re2::RE2;//g;s/namespace re2/namespace re2_st/g;s/re2::/re2_st::/g;s/\"re2\//\"re2_st\//g;s/(.*?_H)/\1_ST/g' re2_st/$i; -done diff --git a/contrib/libre2/re2/bitmap256.h b/contrib/libre2/re2/bitmap256.h new file mode 100644 index 00000000000..1abae99ee6e --- /dev/null +++ b/contrib/libre2/re2/bitmap256.h @@ -0,0 +1,113 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef RE2_BITMAP256_H_ +#define RE2_BITMAP256_H_ + +#ifdef _MSC_VER +#include +#endif +#include +#include + +#include "util/util.h" +#include "util/logging.h" + +namespace re2 { + +class Bitmap256 { + public: + Bitmap256() { + memset(words_, 0, sizeof words_); + } + + // Tests the bit with index c. + bool Test(int c) const { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + return (words_[c / 64] & (1ULL << (c % 64))) != 0; + } + + // Sets the bit with index c. + void Set(int c) { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + words_[c / 64] |= (1ULL << (c % 64)); + } + + // Finds the next non-zero bit with index >= c. 
+ // Returns -1 if no such bit exists. + int FindNextSetBit(int c) const; + + private: + // Finds the least significant non-zero bit in n. + static int FindLSBSet(uint64_t n) { + DCHECK_NE(n, 0); + +#if defined(__GNUC__) + return __builtin_ctzll(n); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long c; + _BitScanForward64(&c, n); + return static_cast(c); +#elif defined(_MSC_VER) && defined(_M_IX86) + unsigned long c; + if (static_cast(n) != 0) { + _BitScanForward(&c, static_cast(n)); + return static_cast(c); + } else { + _BitScanForward(&c, static_cast(n >> 32)); + return static_cast(c) + 32; + } +#else + int c = 63; + for (int shift = 1 << 5; shift != 0; shift >>= 1) { + uint64_t word = n << shift; + if (word != 0) { + n = word; + c -= shift; + } + } + return c; +#endif + } + + uint64_t words_[4]; +}; + +int Bitmap256::FindNextSetBit(int c) const { + DCHECK_GE(c, 0); + DCHECK_LE(c, 255); + + // Check the word that contains the bit. Mask out any lower bits. + int i = c / 64; + uint64_t word = words_[i] & (~0ULL << (c % 64)); + if (word != 0) + return (i * 64) + FindLSBSet(word); + + // Check any following words. + i++; + switch (i) { + case 1: + if (words_[1] != 0) + return (1 * 64) + FindLSBSet(words_[1]); + FALLTHROUGH_INTENDED; + case 2: + if (words_[2] != 0) + return (2 * 64) + FindLSBSet(words_[2]); + FALLTHROUGH_INTENDED; + case 3: + if (words_[3] != 0) + return (3 * 64) + FindLSBSet(words_[3]); + FALLTHROUGH_INTENDED; + default: + return -1; + } +} + +} // namespace re2 + +#endif // RE2_BITMAP256_H_ diff --git a/contrib/libre2/re2/bitstate.cc b/contrib/libre2/re2/bitstate.cc index 518d6420127..5ca2aa30caf 100644 --- a/contrib/libre2/re2/bitstate.cc +++ b/contrib/libre2/re2/bitstate.cc @@ -17,6 +17,11 @@ // SearchBitState is a fast replacement for the NFA code on small // regexps and texts when SearchOnePass cannot be used. 
+#include +#include +#include + +#include "util/logging.h" #include "re2/prog.h" #include "re2/regexp.h" @@ -60,8 +65,8 @@ class BitState { int ncap_; static const int VisitedBits = 32; - uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked - int nvisited_; // # of words in bitmap + uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked + size_t nvisited_; // # of words in bitmap Job *job_; // stack of text positions to explore int njob_; @@ -94,7 +99,7 @@ BitState::~BitState() { // If so, remember that it was visited so that the next time, // we don't repeat the visit. bool BitState::ShouldVisit(int id, const char* p) { - uint n = id * (text_.size() + 1) + (p - text_.begin()); + size_t n = id * (text_.size() + 1) + (p - text_.begin()); if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) return false; visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); @@ -103,7 +108,6 @@ bool BitState::ShouldVisit(int id, const char* p) { // Grow the stack. bool BitState::GrowStack() { - // VLOG(0) << "Reallocate."; maxjob_ *= 2; Job* newjob = new Job[maxjob_]; memmove(newjob, job_, njob_*sizeof job_[0]); @@ -141,6 +145,7 @@ void BitState::Push(int id, const char* p, int arg) { // Return whether it succeeded. bool BitState::TrySearch(int id0, const char* p0) { bool matched = false; + bool inaltmatch = false; const char* end = text_.end(); njob_ = 0; Push(id0, p0, 0); @@ -159,81 +164,86 @@ bool BitState::TrySearch(int id0, const char* p0) { // would have, but we avoid the stack // manipulation. if (0) { + Next: + // If the Match of a non-greedy AltMatch failed, + // we stop ourselves from trying the ByteRange, + // which would steer us off the short circuit. + if (prog_->inst(id)->last() || inaltmatch) + continue; + id++; + CheckAndLoop: if (!ShouldVisit(id, p)) continue; } // Visit ip, p. 
- // VLOG(0) << "Job: " << ip->id() << " " - // << (p - text_.begin()) << " " << arg; Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { - case kInstFail: default: LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg; return false; - case kInstAlt: - // Cannot just - // Push(ip->out1(), p, 0); - // Push(ip->out(), p, 0); - // If, during the processing of ip->out(), we encounter - // ip->out1() via another path, we want to process it then. - // Pushing it here will inhibit that. Instead, re-push - // ip with arg==1 as a reminder to push ip->out1() later. + case kInstFail: + continue; + + case kInstAltMatch: switch (arg) { case 0: + inaltmatch = true; Push(id, p, 1); // come back when we're done + + // One opcode is ByteRange; the other leads to Match + // (possibly via Nop or Capture). + if (ip->greedy(prog_)) { + // out1 is the match + Push(ip->out1(), p, 0); + id = ip->out1(); + p = end; + goto CheckAndLoop; + } + // out is the match - non-greedy + Push(ip->out(), end, 0); id = ip->out(); goto CheckAndLoop; case 1: - // Finished ip->out(); try ip->out1(). - arg = 0; - id = ip->out1(); - goto CheckAndLoop; + inaltmatch = false; + continue; } - LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; + LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg; continue; - case kInstAltMatch: - // One opcode is byte range; the other leads to match. 
- if (ip->greedy(prog_)) { - // out1 is the match - Push(ip->out1(), p, 0); - id = ip->out1(); - p = end; - goto CheckAndLoop; - } - // out is the match - non-greedy - Push(ip->out(), end, 0); - id = ip->out(); - goto CheckAndLoop; - case kInstByteRange: { int c = -1; if (p < end) c = *p & 0xFF; - if (ip->Matches(c)) { - id = ip->out(); - p++; - goto CheckAndLoop; - } - continue; + if (!ip->Matches(c)) + goto Next; + + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done + id = ip->out(); + p++; + goto CheckAndLoop; } case kInstCapture: switch (arg) { case 0: + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done + if (0 <= ip->cap() && ip->cap() < ncap_) { // Capture p to register, but save old value. Push(id, cap_[ip->cap()], 1); // come back when we're done cap_[ip->cap()] = p; } + // Continue on. id = ip->out(); goto CheckAndLoop; + case 1: // Finished ip->out(); restore the old value. cap_[ip->cap()] = p; @@ -244,19 +254,23 @@ bool BitState::TrySearch(int id0, const char* p0) { case kInstEmptyWidth: if (ip->empty() & ~Prog::EmptyFlags(context_, p)) - continue; + goto Next; + + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done id = ip->out(); goto CheckAndLoop; case kInstNop: + if (!ip->last()) + Push(id+1, p, 0); // try the next when we're done id = ip->out(); goto CheckAndLoop; case kInstMatch: { if (endmatch_ && p != text_.end()) - continue; + goto Next; - // VLOG(0) << "Found match."; // We found a match. If the caller doesn't care // where the match is, no point going further. if (nsubmatch_ == 0) @@ -270,7 +284,9 @@ bool BitState::TrySearch(int id0, const char* p0) { if (submatch_[0].data() == NULL || (longest_ && p > submatch_[0].end())) { for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); + submatch_[i] = + StringPiece(cap_[2 * i], + static_cast(cap_[2 * i + 1] - cap_[2 * i])); } // If going for first match, we're done. 
@@ -282,7 +298,7 @@ bool BitState::TrySearch(int id0, const char* p0) { return true; // Otherwise, continue on in hope of a longer match. - continue; + goto Next; } } } @@ -308,13 +324,12 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, submatch_ = submatch; nsubmatch_ = nsubmatch; for (int i = 0; i < nsubmatch_; i++) - submatch_[i] = NULL; + submatch_[i] = StringPiece(); // Allocate scratch space. nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits; - visited_ = new uint32[nvisited_]; + visited_ = new uint32_t[nvisited_]; memset(visited_, 0, nvisited_*sizeof visited_[0]); - // VLOG(0) << "nvisited_ = " << nvisited_; ncap_ = 2*nsubmatch; if (ncap_ < 2) @@ -338,6 +353,14 @@ bool BitState::Search(const StringPiece& text, const StringPiece& context, // but we are not clearing visited_ between calls to TrySearch, // so no work is duplicated and it ends up still being linear. for (const char* p = text.begin(); p <= text.end(); p++) { + // Try to use memchr to find the first byte quickly. + int fb = prog_->first_byte(); + if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) { + p = reinterpret_cast(memchr(p, fb, text.end() - p)); + if (p == NULL) + p = text.end(); + } + cap_[0] = p; if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. return true; diff --git a/contrib/libre2/re2/compile.cc b/contrib/libre2/re2/compile.cc index bf51e7d34a8..454c8726d41 100644 --- a/contrib/libre2/re2/compile.cc +++ b/contrib/libre2/re2/compile.cc @@ -8,6 +8,13 @@ // This file's external interface is just Regexp::CompileToProg. // The Compiler class defined in this file is private. +#include +#include +#include +#include + +#include "util/logging.h" +#include "util/utf.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" @@ -28,14 +35,14 @@ namespace re2 { // is always the fail instruction, which never appears on a list. struct PatchList { - uint32 p; + uint32_t p; // Returns patch list containing just p. 
- static PatchList Mk(uint32 p); + static PatchList Mk(uint32_t p); // Patches all the entries on l to have value v. // Caller must not ever use patch list again. - static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); + static void Patch(Prog::Inst *inst0, PatchList l, uint32_t v); // Deref returns the next pointer pointed at by p. static PatchList Deref(Prog::Inst *inst0, PatchList l); @@ -47,7 +54,7 @@ struct PatchList { static PatchList nullPatchList = { 0 }; // Returns patch list containing just p. -PatchList PatchList::Mk(uint32 p) { +PatchList PatchList::Mk(uint32_t p) { PatchList l; l.p = p; return l; @@ -64,7 +71,7 @@ PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { } // Patches all the entries on l to have value v. -void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { +void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32_t val) { while (l.p != 0) { Prog::Inst* ip = &inst0[l.p>>1]; if (l.p&1) { @@ -103,17 +110,17 @@ PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { // Compiled program fragment. struct Frag { - uint32 begin; + uint32_t begin; PatchList end; Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector - Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} + Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {} }; // Input encodings. enum Encoding { kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) - kEncodingLatin1, // Latin1 (0-FF) + kEncodingLatin1, // Latin-1 (0-FF) }; class Compiler : public Regexp::Walker { @@ -125,12 +132,11 @@ class Compiler : public Regexp::Walker { // Caller is responsible for deleting Prog when finished with it. // If reversed is true, compiles for walking over the input // string backward (reverses all concatenations). - static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); + static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem); // Compiles alternation of all the re to a new Prog. 
// Each re has a match with an id equal to its index in the vector. - static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re); + static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); // Interface for Regexp::Walker, which helps traverse the Regexp. // The walk is purely post-recursive: given the machines for the @@ -162,7 +168,7 @@ class Compiler : public Regexp::Walker { Frag NoMatch(); // Returns a fragment that matches the empty string. - Frag Match(int32 id); + Frag Match(int32_t id); // Returns a no-op fragment. Frag Nop(); @@ -178,9 +184,6 @@ class Compiler : public Regexp::Walker { // Returns -1 if no more instructions are available. int AllocInst(int n); - // Deletes unused instructions. - void Trim(); - // Rune range compiler. // Begins a new alternation. @@ -193,19 +196,35 @@ class Compiler : public Regexp::Walker { void Add_80_10ffff(); // New suffix that matches the byte range lo-hi, then goes to next. - int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); - int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); + int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); + int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); + + // Returns true iff the suffix is cached. + bool IsCachedRuneByteSuffix(int id); // Adds a suffix to alternation. void AddSuffix(int id); + // Adds a suffix to the trie starting from the given root node. + // Returns zero iff allocating an instruction fails. Otherwise, returns + // the current root node, which might be different from what was given. + int AddSuffixRecursive(int root, int id); + + // Finds the trie node for the given suffix. Returns a Frag in order to + // distinguish between pointing at the root node directly (end.p == 0) + // and pointing at an Alt's out1 or out (end.p&1 == 1 or 0, respectively). 
+ Frag FindByteRange(int root, int id); + + // Compares two ByteRanges and returns true iff they are equal. + bool ByteRangeEqual(int id1, int id2); + // Returns the alternation of all the added suffixes. Frag EndRange(); // Single rune. Frag Literal(Rune r, bool foldcase); - void Setup(Regexp::ParseFlags, int64, RE2::Anchor); + void Setup(Regexp::ParseFlags, int64_t, RE2::Anchor); Prog* Finish(); // Returns .* where dot = any byte @@ -223,14 +242,15 @@ class Compiler : public Regexp::Walker { int inst_len_; // Number of instructions used. int inst_cap_; // Number of instructions allocated. - int64 max_mem_; // Total memory budget. + int64_t max_mem_; // Total memory budget. - map rune_cache_; + std::unordered_map rune_cache_; Frag rune_range_; RE2::Anchor anchor_; // anchor mode for RE2::Set - DISALLOW_EVIL_CONSTRUCTORS(Compiler); + Compiler(const Compiler&) = delete; + Compiler& operator=(const Compiler&) = delete; }; Compiler::Compiler() { @@ -265,7 +285,8 @@ int Compiler::AllocInst(int n) { while (inst_len_ + n > inst_cap_) inst_cap_ *= 2; Prog::Inst* ip = new Prog::Inst[inst_cap_]; - memmove(ip, inst_, inst_len_ * sizeof ip[0]); + if (inst_ != NULL) + memmove(ip, inst_, inst_len_ * sizeof ip[0]); memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); delete[] inst_; inst_ = ip; @@ -275,16 +296,6 @@ int Compiler::AllocInst(int n) { return id; } -void Compiler::Trim() { - if (inst_len_ < inst_cap_) { - Prog::Inst* ip = new Prog::Inst[inst_len_]; - memmove(ip, inst_, inst_len_ * sizeof ip[0]); - delete[] inst_; - inst_ = ip; - inst_cap_ = inst_len_; - } -} - // These routines are somewhat hard to visualize in text -- // see http://swtch.com/~rsc/regexp/regexp1.html for // pictures explaining what is going on here. 
@@ -393,16 +404,6 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { if (id < 0) return NoMatch(); inst_[id].InitByteRange(lo, hi, foldcase, 0); - prog_->byte_inst_count_++; - prog_->MarkByteRange(lo, hi); - if (foldcase && lo <= 'z' && hi >= 'a') { - if (lo < 'a') - lo = 'a'; - if (hi > 'z') - hi = 'z'; - if (lo <= hi) - prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); - } return Frag(id, PatchList::Mk(id << 1)); } @@ -416,7 +417,7 @@ Frag Compiler::Nop() { } // Returns a fragment that signals a match. -Frag Compiler::Match(int32 match_id) { +Frag Compiler::Match(int32_t match_id) { int id = AllocInst(1); if (id < 0) return NoMatch(); @@ -430,16 +431,6 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { if (id < 0) return NoMatch(); inst_[id].InitEmptyWidth(empty, 0); - if (empty & (kEmptyBeginLine|kEmptyEndLine)) - prog_->MarkByteRange('\n', '\n'); - if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { - int j; - for (int i = 0; i < 256; i = j) { - for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) - ; - prog_->MarkByteRange(i, j-1); - } - } return Frag(id, PatchList::Mk(id << 1)); } @@ -482,7 +473,7 @@ void Compiler::BeginRange() { rune_range_.end = nullPatchList; } -int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, +int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next) { Frag f = ByteRange(lo, hi, foldcase); if (next != 0) { @@ -493,18 +484,18 @@ int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, return f.begin; } -int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { - // In Latin1 mode, there's no point in caching. - // In forward UTF-8 mode, only need to cache continuation bytes. 
- if (encoding_ == kEncodingLatin1 || - (encoding_ == kEncodingUTF8 && - !reversed_ && - !(0x80 <= lo && hi <= 0xbf))) { - return UncachedRuneByteSuffix(lo, hi, foldcase, next); - } +static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, + int next) { + return (uint64_t)next << 17 | + (uint64_t)lo << 9 | + (uint64_t)hi << 1 | + (uint64_t)foldcase; +} - uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; - map::iterator it = rune_cache_.find(key); +int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, + int next) { + uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); + std::unordered_map::const_iterator it = rune_cache_.find(key); if (it != rune_cache_.end()) return it->second; int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); @@ -512,12 +503,31 @@ int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { return id; } +bool Compiler::IsCachedRuneByteSuffix(int id) { + uint8_t lo = inst_[id].lo_; + uint8_t hi = inst_[id].hi_; + bool foldcase = inst_[id].foldcase() != 0; + int next = inst_[id].out(); + + uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); + return rune_cache_.find(key) != rune_cache_.end(); +} + void Compiler::AddSuffix(int id) { + if (failed_) + return; + if (rune_range_.begin == 0) { rune_range_.begin = id; return; } + if (encoding_ == kEncodingUTF8) { + // Build a trie in order to reduce fanout. 
+ rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id); + return; + } + int alt = AllocInst(1); if (alt < 0) { rune_range_.begin = 0; @@ -527,6 +537,102 @@ void Compiler::AddSuffix(int id) { rune_range_.begin = alt; } +int Compiler::AddSuffixRecursive(int root, int id) { + DCHECK(inst_[root].opcode() == kInstAlt || + inst_[root].opcode() == kInstByteRange); + + Frag f = FindByteRange(root, id); + if (IsNoMatch(f)) { + int alt = AllocInst(1); + if (alt < 0) + return 0; + inst_[alt].InitAlt(root, id); + return alt; + } + + int br; + if (f.end.p == 0) + br = root; + else if (f.end.p&1) + br = inst_[f.begin].out1(); + else + br = inst_[f.begin].out(); + + if (IsCachedRuneByteSuffix(br)) { + // We can't fiddle with cached suffixes, so make a clone of the head. + int byterange = AllocInst(1); + if (byterange < 0) + return 0; + inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(), + inst_[br].foldcase(), inst_[br].out()); + + // Ensure that the parent points to the clone, not to the original. + // Note that this could leave the head unreachable except via the cache. + br = byterange; + if (f.end.p == 0) + root = br; + else if (f.end.p&1) + inst_[f.begin].out1_ = br; + else + inst_[f.begin].set_out(br); + } + + int out = inst_[id].out(); + if (!IsCachedRuneByteSuffix(id)) { + // The head should be the instruction most recently allocated, so free it + // instead of leaving it unreachable. 
+ DCHECK_EQ(id, inst_len_-1); + inst_[id].out_opcode_ = 0; + inst_[id].out1_ = 0; + inst_len_--; + } + + out = AddSuffixRecursive(inst_[br].out(), out); + if (out == 0) + return 0; + + inst_[br].set_out(out); + return root; +} + +bool Compiler::ByteRangeEqual(int id1, int id2) { + return inst_[id1].lo() == inst_[id2].lo() && + inst_[id1].hi() == inst_[id2].hi() && + inst_[id1].foldcase() == inst_[id2].foldcase(); +} + +Frag Compiler::FindByteRange(int root, int id) { + if (inst_[root].opcode() == kInstByteRange) { + if (ByteRangeEqual(root, id)) + return Frag(root, nullPatchList); + else + return NoMatch(); + } + + while (inst_[root].opcode() == kInstAlt) { + int out1 = inst_[root].out1(); + if (ByteRangeEqual(out1, id)) + return Frag(root, PatchList::Mk((root << 1) | 1)); + + // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't + // what we're looking for, then we can stop immediately. Unfortunately, we + // can't short-circuit the search in reverse mode. + if (!reversed_) + return NoMatch(); + + int out = inst_[root].out(); + if (inst_[out].opcode() == kInstAlt) + root = out; + else if (ByteRangeEqual(out, id)) + return Frag(root, PatchList::Mk(root << 1)); + else + return NoMatch(); + } + + LOG(DFATAL) << "should never happen"; + return NoMatch(); +} + Frag Compiler::EndRange() { return rune_range_; } @@ -550,12 +656,13 @@ void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { } void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { - // Latin1 is easy: runes *are* bytes. + // Latin-1 is easy: runes *are* bytes. 
if (lo > hi || lo > 0xFF) return; if (hi > 0xFF) hi = 0xFF; - AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + AddSuffix(UncachedRuneByteSuffix(static_cast(lo), + static_cast(hi), foldcase, 0)); } // Table describing how to make a UTF-8 matching machine @@ -591,12 +698,13 @@ static struct ByteRangeProg { void Compiler::Add_80_10ffff() { int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning - for (size_t i = 0; i < arraysize(prog_80_10ffff); i++) { + for (int i = 0; i < arraysize(prog_80_10ffff); i++) { const ByteRangeProg& p = prog_80_10ffff[i]; int next = 0; if (p.next >= 0) next = inst[p.next]; - inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); + inst[i] = UncachedRuneByteSuffix(static_cast(p.lo), + static_cast(p.hi), false, next); if ((p.lo & 0xC0) != 0x80) AddSuffix(inst[i]); } @@ -625,13 +733,14 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { // ASCII range is always a special case. if (hi < Runeself) { - AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); + AddSuffix(UncachedRuneByteSuffix(static_cast(lo), + static_cast(hi), foldcase, 0)); return; } // Split range into sections that agree on leading bytes. for (int i = 1; i < UTFmax; i++) { - uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence + uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence if ((lo & ~m) != (hi & ~m)) { if ((lo & m) != 0) { AddRuneRangeUTF8(lo, lo|m, foldcase); @@ -647,19 +756,55 @@ void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { } // Finally. Generate byte matching equivalent for lo-hi. - uint8 ulo[UTFmax], uhi[UTFmax]; + uint8_t ulo[UTFmax], uhi[UTFmax]; int n = runetochar(reinterpret_cast(ulo), &lo); int m = runetochar(reinterpret_cast(uhi), &hi); (void)m; // USED(m) DCHECK_EQ(n, m); + // The logic below encodes this thinking: + // + // 1. 
When we have built the whole suffix, we know that it cannot + // possibly be a suffix of anything longer: in forward mode, nothing + // else can occur before the leading byte; in reverse mode, nothing + // else can occur after the last continuation byte or else the leading + // byte would have to change. Thus, there is no benefit to caching + // the first byte of the suffix whereas there is a cost involved in + // cloning it if it begins a common prefix, which is fairly likely. + // + // 2. Conversely, the last byte of the suffix cannot possibly be a + // prefix of anything because next == 0, so we will never want to + // clone it, but it is fairly likely to be a common suffix. Perhaps + // more so in reverse mode than in forward mode because the former is + // "converging" towards lower entropy, but caching is still worthwhile + // for the latter in cases such as 80-BF. + // + // 3. Handling the bytes between the first and the last is less + // straightforward and, again, the approach depends on whether we are + // "converging" towards lower entropy: in forward mode, a single byte + // is unlikely to be part of a common suffix whereas a byte range + // is more likely so; in reverse mode, a byte range is unlikely to + // be part of a common suffix whereas a single byte is more likely + // so. The same benefit versus cost argument applies here. int id = 0; if (reversed_) { - for (int i = 0; i < n; i++) - id = RuneByteSuffix(ulo[i], uhi[i], false, id); + for (int i = 0; i < n; i++) { + // In reverse UTF-8 mode: cache the leading byte; don't cache the last + // continuation byte; cache anything else iff it's a single byte (XX-XX). 
+ if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) + id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); + else + id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); + } } else { - for (int i = n-1; i >= 0; i--) - id = RuneByteSuffix(ulo[i], uhi[i], false, id); + for (int i = n-1; i >= 0; i--) { + // In forward UTF-8 mode: don't cache the leading byte; cache the last + // continuation byte; cache anything else iff it's a byte range (XX-YY). + if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) + id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); + else + id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); + } } AddSuffix(id); } @@ -699,11 +844,11 @@ Frag Compiler::Literal(Rune r, bool foldcase) { case kEncodingUTF8: { if (r < Runeself) // Make common case fast. return ByteRange(r, r, foldcase); - uint8 buf[UTFmax]; + uint8_t buf[UTFmax]; int n = runetochar(reinterpret_cast(buf), &r); - Frag f = ByteRange((uint8)buf[0], buf[0], false); + Frag f = ByteRange((uint8_t)buf[0], buf[0], false); for (int i = 1; i < n; i++) - f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); + f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false)); return f; } } @@ -732,9 +877,11 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, case kRegexpHaveMatch: { Frag f = Match(re->match_id()); - // Remember unanchored match to end of string. - if (anchor_ != RE2::ANCHOR_BOTH) - f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); + if (anchor_ == RE2::ANCHOR_BOTH) { + // Append \z or else the subexpression will effectively be unanchored. + // Complemented by the UNANCHORED case in CompileSet(). 
+ f = Cat(EmptyWidth(kEmptyEndText), f); + } return f; } @@ -753,16 +900,16 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, } case kRegexpStar: - return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpPlus: - return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpQuest: - return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); + return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); case kRegexpLiteral: - return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); + return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); case kRegexpLiteralString: { // Concatenation of literals. @@ -770,7 +917,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, return Nop(); Frag f; for (int i = 0; i < re->nrunes(); i++) { - Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); + Frag f1 = Literal(re->runes()[i], + (re->parse_flags()&Regexp::FoldCase) != 0); if (i == 0) f = f1; else @@ -815,7 +963,8 @@ Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, // If this range contains all of A-Za-z or none of it, // the fold flag is unnecessary; don't bother. 
bool fold = foldascii; - if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) + if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || + ('Z' < i->lo && i->hi < 'a')) fold = false; AddRuneRange(i->lo, i->hi, fold); @@ -949,7 +1098,7 @@ static bool IsAnchorEnd(Regexp** pre, int depth) { return false; } -void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, +void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor) { prog_->set_flags(flags); @@ -958,11 +1107,11 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, max_mem_ = max_mem; if (max_mem <= 0) { max_inst_ = 100000; // more than enough - } else if (max_mem <= static_cast(sizeof(Prog))) { + } else if (static_cast(max_mem) <= sizeof(Prog)) { // No room for anything. max_inst_ = 0; } else { - int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); + int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); // Limit instruction count so that inst->id() fits nicely in an int. // SparseArray also assumes that the indices (inst->id()) are ints. // The call to WalkExponential uses 2*max_inst_ below, @@ -978,7 +1127,7 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, if (m > Prog::Inst::kMaxInst) m = Prog::Inst::kMaxInst; - max_inst_ = m; + max_inst_ = static_cast(m); } anchor_ = anchor; @@ -989,10 +1138,9 @@ void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, // If reversed is true, compiles a program that expects // to run over the input string backward (reverses all concatenations). // The reversed flag is also recorded in the returned program. 
-Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { +Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { Compiler c; - - c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); + c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); c.reversed_ = reversed; // Simplify to remove things like counted repetitions @@ -1007,7 +1155,7 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { bool is_anchor_end = IsAnchorEnd(&sre, 0); // Generate fragment for entire regexp. - Frag f = c.WalkExponential(sre, Frag(), 2*c.max_inst_); + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_); sre->Decref(); if (c.failed_) return NULL; @@ -1016,10 +1164,10 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { // Turn off c.reversed_ (if it is set) to force the remaining concatenations // to behave normally. c.reversed_ = false; - Frag all = c.Cat(f, c.Match(0)); - c.prog_->set_start(all.begin); + all = c.Cat(all, c.Match(0)); - if (reversed) { + c.prog_->set_reversed(reversed); + if (c.prog_->reversed()) { c.prog_->set_anchor_start(is_anchor_end); c.prog_->set_anchor_end(is_anchor_start); } else { @@ -1027,15 +1175,12 @@ Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { c.prog_->set_anchor_end(is_anchor_end); } - // Also create unanchored version, which starts with a .*? loop. - if (c.prog_->anchor_start()) { - c.prog_->set_start_unanchored(c.prog_->start()); - } else { - Frag unanchored = c.Cat(c.DotStar(), all); - c.prog_->set_start_unanchored(unanchored.begin); + c.prog_->set_start(all.begin); + if (!c.prog_->anchor_start()) { + // Also create unanchored version, which starts with a .*? loop. + all = c.Cat(c.DotStar(), all); } - - c.prog_->set_reversed(reversed); + c.prog_->set_start_unanchored(all.begin); // Hand ownership of prog_ to caller. 
return c.Finish(); @@ -1050,22 +1195,20 @@ Prog* Compiler::Finish() { inst_len_ = 1; } - // Trim instruction to minimum array and transfer to Prog. - Trim(); + // Hand off the array to Prog. prog_->inst_ = inst_; prog_->size_ = inst_len_; inst_ = NULL; - // Compute byte map. - prog_->ComputeByteMap(); - prog_->Optimize(); + prog_->Flatten(); + prog_->ComputeByteMap(); // Record remaining memory for DFA. if (max_mem_ <= 0) { prog_->set_dfa_mem(1<<20); } else { - int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); + int64_t m = max_mem_ - sizeof(Prog) - prog_->size_*sizeof(Prog::Inst); if (m < 0) m = 0; prog_->set_dfa_mem(m); @@ -1077,11 +1220,11 @@ Prog* Compiler::Finish() { } // Converts Regexp to Prog. -Prog* Regexp::CompileToProg(int64 max_mem) { +Prog* Regexp::CompileToProg(int64_t max_mem) { return Compiler::Compile(this, false, max_mem); } -Prog* Regexp::CompileToReverseProg(int64 max_mem) { +Prog* Regexp::CompileToReverseProg(int64_t max_mem) { return Compiler::Compile(this, true, max_mem); } @@ -1090,41 +1233,41 @@ Frag Compiler::DotStar() { } // Compiles RE set to Prog. -Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re) { +Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { Compiler c; + c.Setup(re->parse_flags(), max_mem, anchor); - Regexp::ParseFlags pf = static_cast(options.ParseFlags()); - c.Setup(pf, options.max_mem(), anchor); + Regexp* sre = re->Simplify(); + if (sre == NULL) + return NULL; - // Compile alternation of fragments. - Frag all = c.WalkExponential(re, Frag(), 2*c.max_inst_); - re->Decref(); + Frag all = c.WalkExponential(sre, Frag(), 2*c.max_inst_); + sre->Decref(); if (c.failed_) return NULL; - if (anchor == RE2::UNANCHORED) { - // The trailing .* was added while handling kRegexpHaveMatch. - // We just have to add the leading one. 
- all = c.Cat(c.DotStar(), all); - } - - c.prog_->set_start(all.begin); - c.prog_->set_start_unanchored(all.begin); c.prog_->set_anchor_start(true); c.prog_->set_anchor_end(true); + if (anchor == RE2::UNANCHORED) { + // Prepend .* or else the expression will effectively be anchored. + // Complemented by the ANCHOR_BOTH case in PostVisit(). + all = c.Cat(c.DotStar(), all); + } + c.prog_->set_start(all.begin); + c.prog_->set_start_unanchored(all.begin); + Prog* prog = c.Finish(); if (prog == NULL) return NULL; // Make sure DFA has enough memory to operate, // since we're not going to fall back to the NFA. - bool failed; + bool dfa_failed = false; StringPiece sp = "hello, world"; prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, - NULL, &failed, NULL); - if (failed) { + NULL, &dfa_failed, NULL); + if (dfa_failed) { delete prog; return NULL; } @@ -1132,9 +1275,8 @@ Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, return prog; } -Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re) { - return Compiler::CompileSet(options, anchor, re); +Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { + return Compiler::CompileSet(re, anchor, max_mem); } } // namespace re2 diff --git a/contrib/libre2/re2/dfa.cc b/contrib/libre2/re2/dfa.cc index b551acc9c13..bc81fea35ca 100644 --- a/contrib/libre2/re2/dfa.cc +++ b/contrib/libre2/re2/dfa.cc @@ -21,15 +21,33 @@ // // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/logging.h" +#include "util/mix.h" +#include "util/mutex.h" +#include "util/sparse_set.h" +#include "util/strutil.h" #include "re2/prog.h" #include "re2/stringpiece.h" -#include "util/atomicops.h" -#include "util/flags.h" -#include "util/sparse_set.h" -DEFINE_bool(re2_dfa_bail_when_slow, true, - "Whether the RE2 DFA should bail out early " - "if the NFA would be faster (for testing)."); +// Silence "zero-sized array in struct/union" warning for DFA::State::next_. +#ifdef _MSC_VER +#pragma warning(disable: 4200) +#endif namespace re2 { @@ -44,9 +62,12 @@ static void* memrchr(const void* s, int c, size_t n) { } #endif +// Controls whether the DFA should bail out early if the NFA would be faster. +static bool dfa_should_bail_when_slow = true; + // Changing this to true compiles in prints that trace execution of the DFA. // Generates a lot of output -- only useful for debugging. -static const bool DebugDFA = false; +static const bool ExtraDebug = false; // A DFA implementation of a regular expression program. // Since this is entirely a forward declaration mandated by C++, @@ -54,7 +75,7 @@ static const bool DebugDFA = false; // the comments in the sections that follow the DFA definition. class DFA { public: - DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem); + DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem); ~DFA(); bool ok() const { return !init_failed_; } Prog::MatchKind kind() { return kind_; } @@ -74,11 +95,13 @@ class DFA { // memory), it sets *failed and returns false. bool Search(const StringPiece& text, const StringPiece& context, bool anchored, bool want_earliest_match, bool run_forward, - bool* failed, const char** ep, vector* matches); + bool* failed, const char** ep, SparseSet* matches); - // Builds out all states for the entire DFA. FOR TESTING ONLY - // Returns number of states. 
- int BuildAllStates(); + // Builds out all states for the entire DFA. + // If cb is not empty, it receives one callback per state built. + // Returns the number of states built. + // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. + int BuildAllStates(const Prog::DFAStateCallback& cb); // Computes min and max for matching strings. Won't return strings // bigger than maxlen. @@ -86,94 +109,72 @@ class DFA { // These data structures are logically private, but C++ makes it too // difficult to mark them as such. - class Workq; class RWLocker; class StateSaver; + class Workq; // A single DFA state. The DFA is represented as a graph of these // States, linked by the next_ pointers. If in state s and reading // byte c, the next state should be s->next_[c]. struct State { - inline bool IsMatch() const { return flag_ & kFlagMatch; } - void SaveMatch(vector* v); + inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; } + void SaveMatch(std::vector* v); int* inst_; // Instruction pointers in the state. int ninst_; // # of inst_ pointers. - uint flag_; // Empty string bitfield flags in effect on the way + uint32_t flag_; // Empty string bitfield flags in effect on the way // into this state, along with kFlagMatch if this // is a matching state. - State** next_; // Outgoing arrows from State, + +// Work around the bug affecting flexible array members in GCC 6.x (for x >= 1). 
+// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70932) +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && __GNUC_MINOR__ >= 1 + std::atomic next_[0]; // Outgoing arrows from State, +#else + std::atomic next_[]; // Outgoing arrows from State, +#endif + // one per input byte class }; enum { kByteEndText = 256, // imaginary byte at end of text - kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags - kFlagMatch = 0x1000, // State.flag_: this is a matching state - kFlagLastWord = 0x2000, // State.flag_: last byte was a word char + kFlagEmptyMask = 0xFF, // State.flag_: bits holding kEmptyXXX flags + kFlagMatch = 0x0100, // State.flag_: this is a matching state + kFlagLastWord = 0x0200, // State.flag_: last byte was a word char kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left }; -#ifndef STL_MSVC - // STL function structures for use with unordered_set. + struct StateHash { + size_t operator()(const State* a) const { + DCHECK(a != NULL); + HashMix mix(a->flag_); + for (int i = 0; i < a->ninst_; i++) + mix.Mix(a->inst_[i]); + mix.Mix(0); + return mix.get(); + } + }; + struct StateEqual { bool operator()(const State* a, const State* b) const { + DCHECK(a != NULL); + DCHECK(b != NULL); if (a == b) return true; - if (a == NULL || b == NULL) + if (a->flag_ != b->flag_) return false; if (a->ninst_ != b->ninst_) return false; - if (a->flag_ != b->flag_) - return false; for (int i = 0; i < a->ninst_; i++) if (a->inst_[i] != b->inst_[i]) return false; - return true; // they're equal + return true; } }; -#endif // STL_MSVC - struct StateHash { - size_t operator()(const State* a) const { - if (a == NULL) - return 0; - const char* s = reinterpret_cast(a->inst_); - int len = a->ninst_ * sizeof a->inst_[0]; - if (sizeof(size_t) == sizeof(uint32)) - return Hash32StringWithSeed(s, len, a->flag_); - else - return Hash64StringWithSeed(s, len, a->flag_); - } -#ifdef STL_MSVC - // Less than operator. 
- bool operator()(const State* a, const State* b) const { - if (a == b) - return false; - if (a == NULL || b == NULL) - return a == NULL; - if (a->ninst_ != b->ninst_) - return a->ninst_ < b->ninst_; - if (a->flag_ != b->flag_) - return a->flag_ < b->flag_; - for (int i = 0; i < a->ninst_; ++i) - if (a->inst_[i] != b->inst_[i]) - return a->inst_[i] < b->inst_[i]; - return false; // they're equal - } - // The two public members are required by msvc. 4 and 8 are default values. - // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx - static const size_t bucket_size = 4; - static const size_t min_buckets = 8; -#endif // STL_MSVC - }; - -#ifdef STL_MSVC - typedef unordered_set StateSet; -#else // !STL_MSVC - typedef unordered_set StateSet; -#endif // STL_MSVC + typedef std::unordered_set StateSet; private: // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.) @@ -205,11 +206,11 @@ class DFA { // Looks up and returns the State corresponding to a Workq. // L >= mutex_ - State* WorkqToCachedState(Workq* q, uint flag); + State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag); // Looks up and returns a State matching the inst, ninst, and flag. // L >= mutex_ - State* CachedState(int* inst, int ninst, uint flag); + State* CachedState(int* inst, int ninst, uint32_t flag); // Clear the cache entirely. // Must hold cache_mutex_.w or be in destructor. @@ -217,7 +218,7 @@ class DFA { // Converts a State into a Workq: the opposite of WorkqToCachedState. // L >= mutex_ - static void StateToWorkq(State* s, Workq* q); + void StateToWorkq(State* s, Workq* q); // Runs a State on a given byte, returning the next state. State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ @@ -228,18 +229,16 @@ class DFA { // sets *ismatch to true. 
// L >= mutex_ void RunWorkqOnByte(Workq* q, Workq* nq, - int c, uint flag, bool* ismatch, - Prog::MatchKind kind, - int new_byte_loop); + int c, uint32_t flag, bool* ismatch); // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. // L >= mutex_ - void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag); + void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag); // Adds the instruction id to the Workq, following empty arrows // according to flag. // L >= mutex_ - void AddToQueue(Workq* q, int id, uint flag); + void AddToQueue(Workq* q, int id, uint32_t flag); // For debugging, returns a text representation of State. static string DumpState(State* state); @@ -272,10 +271,11 @@ class DFA { RWLocker *cache_lock; bool failed; // "out" parameter: whether search gave up const char* ep; // "out" parameter: end pointer for match - vector* matches; + SparseSet* matches; private: - DISALLOW_EVIL_CONSTRUCTORS(SearchParams); + SearchParams(const SearchParams&) = delete; + SearchParams& operator=(const SearchParams&) = delete; }; // Before each search, the parameters to Search are analyzed by @@ -284,7 +284,7 @@ class DFA { struct StartInfo { StartInfo() : start(NULL), firstbyte(kFbUnknown) { } State* start; - volatile int firstbyte; + std::atomic firstbyte; }; // Fills in params->start and params->firstbyte using @@ -292,7 +292,8 @@ class DFA { // false on failure. // cache_mutex_.r <= L < mutex_ bool AnalyzeSearch(SearchParams* params); - bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags); + bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, + uint32_t flags); // The generic search loop, inlined to create specialized versions. // cache_mutex_.r <= L < mutex_ @@ -340,7 +341,6 @@ class DFA { // Constant after initialization. Prog* prog_; // The regular expression program to run. Prog::MatchKind kind_; // The kind of DFA. 
- int start_unanchored_; // start of unanchored program bool init_failed_; // initialization failed (out of memory) Mutex mutex_; // mutex_ >= cache_mutex_.r @@ -358,16 +358,15 @@ class DFA { // readers. Any State* pointers are only valid while cache_mutex_ // is held. Mutex cache_mutex_; - int64 mem_budget_; // Total memory budget for all States. - int64 state_budget_; // Amount of memory remaining for new States. + int64_t mem_budget_; // Total memory budget for all States. + int64_t state_budget_; // Amount of memory remaining for new States. StateSet state_cache_; // All States computed so far. StartInfo start_[kMaxStart]; - bool cache_warned_; // have printed to LOG(INFO) about the cache }; -// Shorthand for casting to uint8*. -static inline const uint8* BytePtr(const void* v) { - return reinterpret_cast(v); +// Shorthand for casting to uint8_t*. +static inline const uint8_t* BytePtr(const void* v) { + return reinterpret_cast(v); } // Work queues @@ -376,6 +375,10 @@ static inline const uint8* BytePtr(const void* v) { // in the work queue when in leftmost-longest matching mode. #define Mark (-1) +// Separates the match IDs from the instructions in inst_. +// Used only for "many match" DFA states. +#define MatchSep (-2) + // Internally, the DFA uses a sparse array of // program instruction pointers as a work queue. 
// In leftmost longest mode, marks separate sections @@ -428,27 +431,29 @@ class DFA::Workq : public SparseSet { int maxmark_; // maximum number of marks int nextmark_; // id of next mark bool last_was_mark_; // last inserted was mark - DISALLOW_EVIL_CONSTRUCTORS(Workq); + + Workq(const Workq&) = delete; + Workq& operator=(const Workq&) = delete; }; -DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) +DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem) : prog_(prog), kind_(kind), init_failed_(false), q0_(NULL), q1_(NULL), astack_(NULL), - mem_budget_(max_mem), - cache_warned_(false) { - if (DebugDFA) + mem_budget_(max_mem) { + if (ExtraDebug) fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str()); int nmark = 0; - start_unanchored_ = 0; - if (kind_ == Prog::kLongestMatch) { - nmark = prog->size(); - start_unanchored_ = prog->start_unanchored(); - } - nastack_ = 2 * prog->size() + nmark; + if (kind_ == Prog::kLongestMatch) + nmark = prog_->size(); + // See DFA::AddToQueue() for why this is so. + nastack_ = prog_->inst_count(kInstCapture) + + prog_->inst_count(kInstEmptyWidth) + + prog_->inst_count(kInstNop) + + nmark + 1; // + 1 for start inst // Account for space needed for DFA, q0, q1, astack. mem_budget_ -= sizeof(DFA); @@ -456,8 +461,6 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) (sizeof(int)+sizeof(int)) * 2; // q0, q1 mem_budget_ -= nastack_ * sizeof(int); // astack if (mem_budget_ < 0) { - LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", - prog_->size(), max_mem); init_failed_ = true; return; } @@ -468,17 +471,18 @@ DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) // At minimum, the search requires room for two states in order // to limp along, restarting frequently. We'll get better performance // if there is room for a larger number of states, say 20. 
- int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + - (prog_->bytemap_range()+1)*sizeof(State*); + // Note that a state stores list heads only, so we use the program + // list count for the upper bound, not the program size. + int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot + int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic) + + (prog_->list_count()+nmark)*sizeof(int); if (state_budget_ < 20*one_state) { - LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", - prog_->size(), max_mem); init_failed_ = true; return; } - q0_ = new Workq(prog->size(), nmark); - q1_ = new Workq(prog->size(), nmark); + q0_ = new Workq(prog_->size(), nmark); + q1_ = new Workq(prog_->size(), nmark); astack_ = new int[nastack_]; } @@ -507,7 +511,7 @@ DFA::~DFA() { string DFA::DumpWorkq(Workq* q) { string s; const char* sep = ""; - for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) { + for (Workq::iterator it = q->begin(); it != q->end(); ++it) { if (q->is_mark(*it)) { StringAppendF(&s, "|"); sep = ""; @@ -534,6 +538,9 @@ string DFA::DumpState(State* state) { if (state->inst_[i] == Mark) { StringAppendF(&s, "|"); sep = ""; + } else if (state->inst_[i] == MatchSep) { + StringAppendF(&s, "||"); + sep = ""; } else { StringAppendF(&s, "%s%d", sep, state->inst_[i]); sep = ","; @@ -601,9 +608,10 @@ string DFA::DumpState(State* state) { // Looks in the State cache for a State matching q, flag. // If one is found, returns it. If one is not found, allocates one, // inserts it in the cache, and returns it. -DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { - if (DEBUG_MODE) - mutex_.AssertHeld(); +// If mq is not null, MatchSep and the match IDs in mq will be appended +// to the State. +DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) { + //mutex_.AssertHeld(); // Construct array of instruction ids for the new state. 
// Only ByteRange, EmptyWidth, and Match instructions are useful to keep: @@ -611,10 +619,10 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { // RunWorkqOnEmptyString or RunWorkqOnByte. int* inst = new int[q->size()]; int n = 0; - uint needflags = 0; // flags needed by kInstEmptyWidth instructions - bool sawmatch = false; // whether queue contains guaranteed kInstMatch - bool sawmark = false; // whether queue contains a Mark - if (DebugDFA) + uint32_t needflags = 0; // flags needed by kInstEmptyWidth instructions + bool sawmatch = false; // whether queue contains guaranteed kInstMatch + bool sawmark = false; // whether queue contains a Mark + if (ExtraDebug) fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); for (Workq::iterator it = q->begin(); it != q->end(); ++it) { int id = *it; @@ -640,36 +648,22 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { (kind_ != Prog::kLongestMatch || !sawmark) && (flag & kFlagMatch)) { delete[] inst; - if (DebugDFA) + if (ExtraDebug) fprintf(stderr, " -> FullMatchState\n"); return FullMatchState; } - // Fall through. - case kInstByteRange: // These are useful. - case kInstEmptyWidth: - case kInstMatch: - case kInstAlt: // Not useful, but necessary [*] - inst[n++] = *it; + FALLTHROUGH_INTENDED; + default: + // Record iff id is the head of its list, which must + // be the case if id-1 is the last of *its* list. :) + if (prog_->inst(id-1)->last()) + inst[n++] = *it; if (ip->opcode() == kInstEmptyWidth) needflags |= ip->empty(); if (ip->opcode() == kInstMatch && !prog_->anchor_end()) sawmatch = true; break; - - default: // The rest are not. - break; } - - // [*] kInstAlt would seem useless to record in a state, since - // we've already followed both its arrows and saved all the - // interesting states we can reach from there. 
The problem - // is that one of the empty-width instructions might lead - // back to the same kInstAlt (if an empty-width operator is starred), - // producing a different evaluation order depending on whether - // we keep the kInstAlt to begin with. Sigh. - // A specific case that this affects is /(^|a)+/ matching "a". - // If we don't save the kInstAlt, we will match the whole "a" (0,1) - // but in fact the correct leftmost-first match is the leading "" (0,0). } DCHECK_LE(n, q->size()); if (n > 0 && inst[n-1] == Mark) @@ -701,7 +695,7 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { // if the state is *not* a matching state. if (n == 0 && flag == 0) { delete[] inst; - if (DebugDFA) + if (ExtraDebug) fprintf(stderr, " -> DeadState\n"); return DeadState; } @@ -716,13 +710,24 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { int* markp = ip; while (markp < ep && *markp != Mark) markp++; - sort(ip, markp); + std::sort(ip, markp); if (markp < ep) markp++; ip = markp; } } + // Append MatchSep and the match IDs in mq if necessary. + if (mq != NULL) { + inst[n++] = MatchSep; + for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) { + int id = *i; + Prog::Inst* ip = prog_->inst(id); + if (ip->opcode() == kInstMatch) + inst[n++] = ip->match_id(); + } + } + // Save the needed empty-width flags in the top bits for use later. flag |= needflags << kFlagNeedShift; @@ -734,42 +739,50 @@ DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { // Looks in the State cache for a State matching inst, ninst, flag. // If one is found, returns it. If one is not found, allocates one, // inserts it in the cache, and returns it. -DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { - if (DEBUG_MODE) - mutex_.AssertHeld(); +DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) { + //mutex_.AssertHeld(); // Look in the cache for a pre-existing state. 
- State state = { inst, ninst, flag, NULL }; + // We have to initialise the struct like this because otherwise + // MSVC will complain about the flexible array member. :( + State state; + state.inst_ = inst; + state.ninst_ = ninst; + state.flag_ = flag; StateSet::iterator it = state_cache_.find(&state); if (it != state_cache_.end()) { - if (DebugDFA) + if (ExtraDebug) fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); return *it; } // Must have enough memory for new state. // In addition to what we're going to allocate, - // the state cache hash table seems to incur about 32 bytes per + // the state cache hash table seems to incur about 40 bytes per // State*, empirically. - const int kStateCacheOverhead = 32; + const int kStateCacheOverhead = 40; int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot - int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int); + int mem = sizeof(State) + nnext*sizeof(std::atomic) + + ninst*sizeof(int); if (mem_budget_ < mem + kStateCacheOverhead) { mem_budget_ = -1; return NULL; } mem_budget_ -= mem + kStateCacheOverhead; - // Allocate new state, along with room for next and inst. + // Allocate new state along with room for next_ and inst_. char* space = new char[mem]; - State* s = reinterpret_cast(space); - s->next_ = reinterpret_cast(s + 1); - s->inst_ = reinterpret_cast(s->next_ + nnext); - memset(s->next_, 0, nnext*sizeof s->next_[0]); + State* s = new (space) State; + (void) new (s->next_) std::atomic[nnext]; + // Work around a unfortunate bug in older versions of libstdc++. + // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658) + for (int i = 0; i < nnext; i++) + (void) new (s->next_ + i) std::atomic(NULL); + s->inst_ = new (s->next_ + nnext) int[ninst]; memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); s->ninst_ = ninst; s->flag_ = flag; - if (DebugDFA) + if (ExtraDebug) fprintf(stderr, " -> %s\n", DumpState(s).c_str()); // Put state in cache and return it. 
@@ -779,39 +792,45 @@ DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { // Clear the cache. Must hold cache_mutex_.w or be in destructor. void DFA::ClearCache() { - // In case state_cache_ doesn't support deleting entries - // during iteration, copy into a vector and then delete. - vector v; - v.reserve(state_cache_.size()); - for (StateSet::iterator it = state_cache_.begin(); - it != state_cache_.end(); ++it) - v.push_back(*it); + StateSet::iterator begin = state_cache_.begin(); + StateSet::iterator end = state_cache_.end(); + while (begin != end) { + StateSet::iterator tmp = begin; + ++begin; + // Deallocate the blob of memory that we allocated in DFA::CachedState(). + delete[] reinterpret_cast(*tmp); + } state_cache_.clear(); - for (size_t i = 0; i < v.size(); i++) - delete[] reinterpret_cast(v[i]); } // Copies insts in state s to the work queue q. void DFA::StateToWorkq(State* s, Workq* q) { q->clear(); for (int i = 0; i < s->ninst_; i++) { - if (s->inst_[i] == Mark) + if (s->inst_[i] == Mark) { q->mark(); - else - q->insert_new(s->inst_[i]); + } else if (s->inst_[i] == MatchSep) { + // Nothing after this is an instruction! + break; + } else { + // Explore from the head of the list. + AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask); + } } } -// Adds ip to the work queue, following empty arrows according to flag -// and expanding kInstAlt instructions (two-target gotos). -void DFA::AddToQueue(Workq* q, int id, uint flag) { +// Adds ip to the work queue, following empty arrows according to flag. +void DFA::AddToQueue(Workq* q, int id, uint32_t flag) { - // Use astack_ to hold our stack of states yet to process. - // It is sized to have room for nastack_ == 2*prog->size() + nmark - // instructions, which is enough: each instruction can be - // processed by the switch below only once, and the processing - // pushes at most two instructions plus maybe a mark. - // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.) 
+ // Use astack_ to hold our stack of instructions yet to process. + // It was preallocated as follows: + // one entry per Capture; + // one entry per EmptyWidth; and + // one entry per Nop. + // This reflects the maximum number of stack pushes that each can + // perform. (Each instruction can be processed at most once.) + // When using marks, we also added nmark == prog_->size(). + // (Otherwise, nmark == 0.) int* stk = astack_; int nstk = 0; @@ -820,6 +839,7 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { DCHECK_LE(nstk, nastack_); id = stk[--nstk]; + Loop: if (id == Mark) { q->mark(); continue; @@ -829,9 +849,8 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { continue; // If ip is already on the queue, nothing to do. - // Otherwise add it. We don't actually keep all the ones - // that get added -- for example, kInstAlt is ignored - // when on a work queue -- but adding all ip's here + // Otherwise add it. We don't actually keep all the + // ones that get added, but adding all of them here // increases the likelihood of q->contains(id), // reducing the amount of duplicated work. if (q->contains(id)) @@ -841,37 +860,46 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { // Process instruction. Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { - case kInstFail: // can't happen: discarded above + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); break; case kInstByteRange: // just save these on the queue case kInstMatch: - break; + if (ip->last()) + break; + id = id+1; + goto Loop; case kInstCapture: // DFA treats captures as no-ops. case kInstNop: - stk[nstk++] = ip->out(); - break; + if (!ip->last()) + stk[nstk++] = id+1; - case kInstAlt: // two choices: expand both, in order - case kInstAltMatch: - // Want to visit out then out1, so push on stack in reverse order. 
- // This instruction is the [00-FF]* loop at the beginning of - // a leftmost-longest unanchored search, separate out from out1 - // with a Mark, so that out1's threads (which will start farther - // to the right in the string being searched) are lower priority - // than the current ones. - stk[nstk++] = ip->out1(); - if (q->maxmark() > 0 && + // If this instruction is the [00-FF]* loop at the beginning of + // a leftmost-longest unanchored search, separate with a Mark so + // that future threads (which will start farther to the right in + // the input string) are lower priority than current threads. + if (ip->opcode() == kInstNop && q->maxmark() > 0 && id == prog_->start_unanchored() && id != prog_->start()) stk[nstk++] = Mark; - stk[nstk++] = ip->out(); - break; + id = ip->out(); + goto Loop; + + case kInstAltMatch: + DCHECK(!ip->last()); + id = id+1; + goto Loop; case kInstEmptyWidth: - if ((ip->empty() & flag) == ip->empty()) - stk[nstk++] = ip->out(); - break; + if (!ip->last()) + stk[nstk++] = id+1; + + // Continue on if we have all the right flag bits. + if (ip->empty() & ~flag) + break; + id = ip->out(); + goto Loop; } } } @@ -892,7 +920,7 @@ void DFA::AddToQueue(Workq* q, int id, uint flag) { // and then processing only $. Doing the two-step sequence won't match // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior // exhibited by existing implementations). -void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) { +void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) { newq->clear(); for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { if (oldq->is_mark(*i)) @@ -907,11 +935,8 @@ void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) { // means to match c$. Sets the bool *ismatch to true if the end of the // regular expression program has been reached (the regexp has matched). 
void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, - int c, uint flag, bool* ismatch, - Prog::MatchKind kind, - int new_byte_loop) { - if (DEBUG_MODE) - mutex_.AssertHeld(); + int c, uint32_t flag, bool* ismatch) { + //mutex_.AssertHeld(); newq->clear(); for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { @@ -924,10 +949,13 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, int id = *i; Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + case kInstFail: // never succeeds case kInstCapture: // already followed case kInstNop: // already followed - case kInstAlt: // already followed case kInstAltMatch: // already followed case kInstEmptyWidth: // already followed break; @@ -938,10 +966,11 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, break; case kInstMatch: - if (prog_->anchor_end() && c != kByteEndText) + if (prog_->anchor_end() && c != kByteEndText && + kind_ != Prog::kManyMatch) break; *ismatch = true; - if (kind == Prog::kFirstMatch) { + if (kind_ == Prog::kFirstMatch) { // Can stop processing work queue since we found a match. return; } @@ -949,7 +978,7 @@ void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, } } - if (DebugDFA) + if (ExtraDebug) fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(), c, flag, DumpWorkq(newq).c_str(), *ismatch); } @@ -965,8 +994,8 @@ DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { // Processes input byte c in state, returning new state. DFA::State* DFA::RunStateOnByte(State* state, int c) { - if (DEBUG_MODE) - mutex_.AssertHeld(); + //mutex_.AssertHeld(); + if (state <= SpecialStateMax) { if (state == FullMatchState) { // It is convenient for routines like PossibleMatchRange @@ -988,8 +1017,7 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { } // If someone else already computed this, return it. 
- State* ns; - ATOMIC_LOAD_CONSUME(ns, &state->next_[ByteMap(c)]); + State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed); if (ns != NULL) return ns; @@ -1000,10 +1028,10 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // around this byte. Before the byte we have the flags recorded // in the State structure itself. After the byte we have // nothing yet (but that will change: read on). - uint needflag = state->flag_ >> kFlagNeedShift; - uint beforeflag = state->flag_ & kFlagEmptyMask; - uint oldbeforeflag = beforeflag; - uint afterflag = 0; + uint32_t needflag = state->flag_ >> kFlagNeedShift; + uint32_t beforeflag = state->flag_ & kFlagEmptyMask; + uint32_t oldbeforeflag = beforeflag; + uint32_t afterflag = 0; if (c == '\n') { // Insert implicit $ and ^ around \n @@ -1019,8 +1047,8 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // The state flag kFlagLastWord says whether the last // byte processed was a word character. Use that info to // insert empty-width (non-)word boundaries. - bool islastword = state->flag_ & kFlagLastWord; - bool isword = (c != kByteEndText && Prog::IsWordChar(c)); + bool islastword = (state->flag_ & kFlagLastWord) != 0; + bool isword = c != kByteEndText && Prog::IsWordChar(static_cast(c)); if (isword == islastword) beforeflag |= kEmptyNonWordBoundary; else @@ -1030,36 +1058,31 @@ DFA::State* DFA::RunStateOnByte(State* state, int c) { // Only useful to rerun on empty string if there are new, useful flags. if (beforeflag & ~oldbeforeflag & needflag) { RunWorkqOnEmptyString(q0_, q1_, beforeflag); + using std::swap; swap(q0_, q1_); } bool ismatch = false; - RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_); - - // Most of the time, we build the state from the output of - // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that - // RE2::Set can tell exactly which match instructions - // contributed to the match, don't swap if c is kByteEndText. 
- // The resulting state wouldn't be correct for further processing - // of the string, but we're at the end of the text so that's okay. - // Leaving q0_ alone preseves the match instructions that led to - // the current setting of ismatch. - if (c != kByteEndText || kind_ != Prog::kManyMatch) - swap(q0_, q1_); + RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch); + using std::swap; + swap(q0_, q1_); // Save afterflag along with ismatch and isword in new state. - uint flag = afterflag; + uint32_t flag = afterflag; if (ismatch) flag |= kFlagMatch; if (isword) flag |= kFlagLastWord; - ns = WorkqToCachedState(q0_, flag); + if (ismatch && kind_ == Prog::kManyMatch) + ns = WorkqToCachedState(q0_, q1_, flag); + else + ns = WorkqToCachedState(q0_, NULL, flag); // Flush ns before linking to it. // Write barrier before updating state->next_ so that the // main search loop can proceed without any locking, for speed. // (Otherwise it would need one mutex operation per input byte.) - ATOMIC_STORE_RELEASE(&state->next_[ByteMap(c)], ns); + state->next_[ByteMap(c)].store(ns, std::memory_order_release); return ns; } @@ -1093,21 +1116,15 @@ class DFA::RWLocker { // Notice that the lock is *released* temporarily. void LockForWriting(); - // Returns whether the lock is already held for writing. 
- bool IsLockedForWriting() { - return writing_; - } - private: Mutex* mu_; bool writing_; - DISALLOW_EVIL_CONSTRUCTORS(RWLocker); + RWLocker(const RWLocker&) = delete; + RWLocker& operator=(const RWLocker&) = delete; }; -DFA::RWLocker::RWLocker(Mutex* mu) - : mu_(mu), writing_(false) { - +DFA::RWLocker::RWLocker(Mutex* mu) : mu_(mu), writing_(false) { mu_->ReaderLock(); } @@ -1116,16 +1133,16 @@ DFA::RWLocker::RWLocker(Mutex* mu) void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { if (!writing_) { mu_->ReaderUnlock(); - mu_->Lock(); + mu_->WriterLock(); writing_ = true; } } DFA::RWLocker::~RWLocker() { - if (writing_) - mu_->WriterUnlock(); - else + if (!writing_) mu_->ReaderUnlock(); + else + mu_->WriterUnlock(); } @@ -1142,24 +1159,12 @@ DFA::RWLocker::~RWLocker() { void DFA::ResetCache(RWLocker* cache_lock) { // Re-acquire the cache_mutex_ for writing (exclusive use). - bool was_writing = cache_lock->IsLockedForWriting(); cache_lock->LockForWriting(); - // If we already held cache_mutex_ for writing, it means - // this invocation of Search() has already reset the - // cache once already. That's a pretty clear indication - // that the cache is too small. Warn about that, once. - // TODO(rsc): Only warn if state_cache_.size() < some threshold. - if (was_writing && !cache_warned_) { - LOG(INFO) << "DFA memory cache could be too small: " - << "only room for " << state_cache_.size() << " states."; - cache_warned_ = true; - } - // Clear the cache, reset the memory budget. 
for (int i = 0; i < kMaxStart; i++) { start_[i].start = NULL; - start_[i].firstbyte = kFbUnknown; + start_[i].firstbyte.store(kFbUnknown, std::memory_order_relaxed); } ClearCache(); mem_budget_ = state_budget_; @@ -1198,11 +1203,12 @@ class DFA::StateSaver { DFA* dfa_; // the DFA to use int* inst_; // saved info from State int ninst_; - uint flag_; + uint32_t flag_; bool is_special_; // whether original state was special State* special_; // if is_special_, the original state - DISALLOW_EVIL_CONSTRUCTORS(StateSaver); + StateSaver(const StateSaver&) = delete; + StateSaver& operator=(const StateSaver&) = delete; }; DFA::StateSaver::StateSaver(DFA* dfa, State* state) { @@ -1312,21 +1318,36 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, bool want_earliest_match, bool run_forward) { State* start = params->start; - const uint8* bp = BytePtr(params->text.begin()); // start of text - const uint8* p = bp; // text scanning point - const uint8* ep = BytePtr(params->text.end()); // end of text - const uint8* resetp = NULL; // p at last cache reset - if (!run_forward) + const uint8_t* bp = BytePtr(params->text.begin()); // start of text + const uint8_t* p = bp; // text scanning point + const uint8_t* ep = BytePtr(params->text.end()); // end of text + const uint8_t* resetp = NULL; // p at last cache reset + if (!run_forward) { + using std::swap; swap(p, ep); + } - const uint8* bytemap = prog_->bytemap(); - const uint8* lastmatch = NULL; // most recent matching position in text + const uint8_t* bytemap = prog_->bytemap(); + const uint8_t* lastmatch = NULL; // most recent matching position in text bool matched = false; + State* s = start; + if (ExtraDebug) + fprintf(stderr, "@stx: %s\n", DumpState(s).c_str()); if (s->IsMatch()) { matched = true; lastmatch = p; + if (ExtraDebug) + fprintf(stderr, "match @stx! 
[%s]\n", DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); + } + } if (want_earliest_match) { params->ep = reinterpret_cast(lastmatch); return true; @@ -1334,9 +1355,10 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, } while (p != ep) { - if (DebugDFA) - fprintf(stderr, "@%d: %s\n", static_cast(p - bp), - DumpState(s).c_str()); + if (ExtraDebug) + fprintf(stderr, "@%td: %s\n", + p - bp, DumpState(s).c_str()); + if (have_firstbyte && s == start) { // In start state, only way out is to find firstbyte, // so use optimized assembly in memchr to skip ahead. @@ -1380,8 +1402,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // Okay to use bytemap[] not ByteMap() here, because // c is known to be an actual byte and not kByteEndText. - State* ns; - ATOMIC_LOAD_CONSUME(ns, &s->next_[bytemap[c]]); + State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire); if (ns == NULL) { ns = RunStateOnByteUnlocked(s, c); if (ns == NULL) { @@ -1393,8 +1414,8 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // same at about 2 MB/s. Unless we're processing an average // of 10 bytes per state computation, fail so that RE2 can // fall back to the NFA. 
- if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL && - (p - resetp) < static_cast(10*state_cache_.size())) { + if (dfa_should_bail_when_slow && resetp != NULL && + static_cast(p - resetp) < 10*state_cache_.size()) { params->failed = true; return false; } @@ -1431,8 +1452,8 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, params->ep = reinterpret_cast(ep); return true; } - s = ns; + s = ns; if (s->IsMatch()) { matched = true; // The DFA notices the match one byte late, @@ -1441,11 +1462,17 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, lastmatch = p - 1; else lastmatch = p + 1; - if (DebugDFA) - fprintf(stderr, "match @%d! [%s]\n", - static_cast(lastmatch - bp), - DumpState(s).c_str()); - + if (ExtraDebug) + fprintf(stderr, "match @%td! [%s]\n", + lastmatch - bp, DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); + } + } if (want_earliest_match) { params->ep = reinterpret_cast(lastmatch); return true; @@ -1455,6 +1482,9 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, // Process one more byte to see if it triggers a match. // (Remember, matches are delayed one byte.) 
+ if (ExtraDebug) + fprintf(stderr, "@etx: %s\n", DumpState(s).c_str()); + int lastbyte; if (run_forward) { if (params->text.end() == params->context.end()) @@ -1468,8 +1498,7 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, lastbyte = params->text.begin()[-1] & 0xFF; } - State* ns; - ATOMIC_LOAD_CONSUME(ns, &s->next_[ByteMap(lastbyte)]); + State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire); if (ns == NULL) { ns = RunStateOnByteUnlocked(s, lastbyte); if (ns == NULL) { @@ -1487,29 +1516,32 @@ inline bool DFA::InlinedSearchLoop(SearchParams* params, } } } - s = ns; - if (DebugDFA) - fprintf(stderr, "@_: %s\n", DumpState(s).c_str()); - if (s == FullMatchState) { + if (ns <= SpecialStateMax) { + if (ns == DeadState) { + params->ep = reinterpret_cast(lastmatch); + return matched; + } + // FullMatchState params->ep = reinterpret_cast(ep); return true; } - if (s > SpecialStateMax && s->IsMatch()) { + + s = ns; + if (s->IsMatch()) { matched = true; lastmatch = p; - if (params->matches && kind_ == Prog::kManyMatch) { - vector* v = params->matches; - v->clear(); - for (int i = 0; i < s->ninst_; i++) { - Prog::Inst* ip = prog_->inst(s->inst_[i]); - if (ip->opcode() == kInstMatch) - v->push_back(ip->match_id()); + if (ExtraDebug) + fprintf(stderr, "match @etx! [%s]\n", DumpState(s).c_str()); + if (params->matches != NULL && kind_ == Prog::kManyMatch) { + for (int i = s->ninst_ - 1; i >= 0; i--) { + int id = s->inst_[i]; + if (id == MatchSep) + break; + params->matches->insert(id); } } - if (DebugDFA) - fprintf(stderr, "match @%d! [%s]\n", static_cast(lastmatch - bp), - DumpState(s).c_str()); } + params->ep = reinterpret_cast(lastmatch); return matched; } @@ -1604,14 +1636,14 @@ bool DFA::AnalyzeSearch(SearchParams* params) { // Sanity check: make sure that text lies within context. 
if (text.begin() < context.begin() || text.end() > context.end()) { - LOG(DFATAL) << "Text is not inside context."; + LOG(DFATAL) << "context does not contain text"; params->start = DeadState; return true; } // Determine correct search type. int start; - uint flags; + uint32_t flags; if (params->run_forward) { if (text.begin() == context.begin()) { start = kStartBeginText; @@ -1657,50 +1689,47 @@ bool DFA::AnalyzeSearch(SearchParams* params) { } } - if (DebugDFA) { - int fb; - ATOMIC_LOAD_RELAXED(fb, &info->firstbyte); + if (ExtraDebug) fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n", params->anchored, params->run_forward, flags, - DumpState(info->start).c_str(), fb); - } + DumpState(info->start).c_str(), info->firstbyte.load()); params->start = info->start; - ATOMIC_LOAD_ACQUIRE(params->firstbyte, &info->firstbyte); + params->firstbyte = info->firstbyte.load(std::memory_order_acquire); return true; } // Fills in info if needed. Returns true on success, false on failure. bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, - uint flags) { + uint32_t flags) { // Quick check. - int fb; - ATOMIC_LOAD_ACQUIRE(fb, &info->firstbyte); + int fb = info->firstbyte.load(std::memory_order_acquire); if (fb != kFbUnknown) return true; MutexLock l(&mutex_); - if (info->firstbyte != kFbUnknown) + fb = info->firstbyte.load(std::memory_order_relaxed); + if (fb != kFbUnknown) return true; q0_->clear(); AddToQueue(q0_, params->anchored ? prog_->start() : prog_->start_unanchored(), flags); - info->start = WorkqToCachedState(q0_, flags); + info->start = WorkqToCachedState(q0_, NULL, flags); if (info->start == NULL) return false; if (info->start == DeadState) { // Synchronize with "quick check" above. - ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone); + info->firstbyte.store(kFbNone, std::memory_order_release); return true; } if (info->start == FullMatchState) { // Synchronize with "quick check" above. 
- ATOMIC_STORE_RELEASE(&info->firstbyte, kFbNone); // will be ignored + info->firstbyte.store(kFbNone, std::memory_order_release); // will be ignored return true; } @@ -1712,7 +1741,7 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, State* s = RunStateOnByte(info->start, i); if (s == NULL) { // Synchronize with "quick check" above. - ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte); + info->firstbyte.store(kFbUnknown, std::memory_order_release); return false; } if (s == info->start) @@ -1725,8 +1754,9 @@ bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, break; } } + // Synchronize with "quick check" above. - ATOMIC_STORE_RELEASE(&info->firstbyte, firstbyte); + info->firstbyte.store(firstbyte, std::memory_order_release); return true; } @@ -1738,7 +1768,7 @@ bool DFA::Search(const StringPiece& text, bool run_forward, bool* failed, const char** epp, - vector* matches) { + SparseSet* matches) { *epp = NULL; if (!ok()) { *failed = true; @@ -1746,10 +1776,10 @@ bool DFA::Search(const StringPiece& text, } *failed = false; - if (DebugDFA) { + if (ExtraDebug) { fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", - text.as_string().c_str(), anchored, want_earliest_match, + text.ToString().c_str(), anchored, want_earliest_match, run_forward, kind_); } @@ -1773,7 +1803,7 @@ bool DFA::Search(const StringPiece& text, *epp = text.end(); return true; } - if (DebugDFA) + if (ExtraDebug) fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); bool ret = FastSearchLoop(&params); if (params.failed) { @@ -1784,59 +1814,38 @@ bool DFA::Search(const StringPiece& text, return ret; } -// Deletes dfa. -// -// This is a separate function so that -// prog.h can be used without moving the definition of -// class DFA out of this file.
If you set -// prog->dfa_ = dfa; -// then you also have to set -// prog->delete_dfa_ = DeleteDFA; -// so that ~Prog can delete the dfa. -static void DeleteDFA(DFA* dfa) { - delete dfa; -} - DFA* Prog::GetDFA(MatchKind kind) { - DFA*volatile* pdfa; - if (kind == kFirstMatch || kind == kManyMatch) { - pdfa = &dfa_first_; - } else { - kind = kLongestMatch; - pdfa = &dfa_longest_; - } - - // Quick check. - DFA *dfa; - ATOMIC_LOAD_ACQUIRE(dfa, pdfa); - if (dfa != NULL) - return dfa; - - MutexLock l(&dfa_mutex_); - dfa = *pdfa; - if (dfa != NULL) - return dfa; - // For a forward DFA, half the memory goes to each DFA. + // However, if it is a "many match" DFA, then there is + // no counterpart with which the memory must be shared. + // // For a reverse DFA, all the memory goes to the // "longest match" DFA, because RE2 never does reverse // "first match" searches. - int64 m = dfa_mem_/2; - if (reversed_) { - if (kind == kLongestMatch || kind == kManyMatch) - m = dfa_mem_; - else - m = 0; + if (kind == kFirstMatch) { + std::call_once(dfa_first_once_, [](Prog* prog) { + prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2); + }, this); + return dfa_first_; + } else if (kind == kManyMatch) { + std::call_once(dfa_first_once_, [](Prog* prog) { + prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_); + }, this); + return dfa_first_; + } else { + std::call_once(dfa_longest_once_, [](Prog* prog) { + if (!prog->reversed_) + prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2); + else + prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_); + }, this); + return dfa_longest_; } - dfa = new DFA(this, kind, m); - delete_dfa_ = DeleteDFA; - - // Synchronize with "quick check" above. - ATOMIC_STORE_RELEASE(pdfa, dfa); - - return dfa; } +void Prog::DeleteDFA(DFA* dfa) { + delete dfa; +} // Executes the regexp program to search in text, // which itself is inside the larger context. 
(As a convenience, @@ -1849,8 +1858,8 @@ DFA* Prog::GetDFA(MatchKind kind) { // This is the only external interface (class DFA only exists in this file). // bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, - Anchor anchor, MatchKind kind, - StringPiece* match0, bool* failed, vector* matches) { + Anchor anchor, MatchKind kind, StringPiece* match0, + bool* failed, SparseSet* matches) { *failed = false; StringPiece context = const_context; @@ -1873,7 +1882,7 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; bool endmatch = false; if (kind == kManyMatch) { - endmatch = true; + // This is split out in order to avoid clobbering kind. } else if (kind == kFullMatch || anchor_end()) { endmatch = true; kind = kLongestMatch; @@ -1881,17 +1890,22 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // If the caller doesn't care where the match is (just whether one exists), // then we can stop at the very first match we find, the so-called - // "shortest match". - bool want_shortest_match = false; - if (match0 == NULL && !endmatch) { - want_shortest_match = true; + // "earliest match". + bool want_earliest_match = false; + if (kind == kManyMatch) { + // This is split out in order to avoid clobbering kind. + if (matches == NULL) { + want_earliest_match = true; + } + } else if (match0 == NULL && !endmatch) { + want_earliest_match = true; kind = kLongestMatch; } DFA* dfa = GetDFA(kind); const char* ep; bool matched = dfa->Search(text, context, anchored, - want_shortest_match, !reversed_, + want_earliest_match, !reversed_, failed, &ep, matches); if (*failed) return false; @@ -1905,51 +1919,89 @@ bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, // as the beginning. 
if (match0) { if (reversed_) - *match0 = StringPiece(ep, text.end() - ep); + *match0 = StringPiece(ep, static_cast(text.end() - ep)); else - *match0 = StringPiece(text.begin(), ep - text.begin()); + *match0 = + StringPiece(text.begin(), static_cast(ep - text.begin())); } return true; } // Build out all states in DFA. Returns number of states. -int DFA::BuildAllStates() { +int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) { if (!ok()) return 0; // Pick out start state for unanchored search // at beginning of text. RWLocker l(&cache_mutex_); - SearchParams params(NULL, NULL, &l); + SearchParams params(StringPiece(), StringPiece(), &l); params.anchored = false; - if (!AnalyzeSearch(&params) || params.start <= SpecialStateMax) + if (!AnalyzeSearch(&params) || + params.start == NULL || + params.start == DeadState) return 0; // Add start state to work queue. - StateSet queued; - vector q; - queued.insert(params.start); + // Note that any State* that we handle here must point into the cache, + // so we can simply depend on pointer-as-a-number hashing and equality. + std::unordered_map m; + std::deque q; + m.emplace(params.start, static_cast(m.size())); q.push_back(params.start); + // Compute the input bytes needed to cover all of the next pointers. + int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot + std::vector input(nnext); + for (int c = 0; c < 256; c++) { + int b = prog_->bytemap()[c]; + while (c < 256-1 && prog_->bytemap()[c+1] == b) + c++; + input[b] = c; + } + input[prog_->bytemap_range()] = kByteEndText; + + // Scratch space for the output. + std::vector output(nnext); + // Flood to expand every state.
- for (size_t i = 0; i < q.size(); i++) { - State* s = q[i]; - for (int c = 0; c < 257; c++) { + bool oom = false; + while (!q.empty()) { + State* s = q.front(); + q.pop_front(); + for (int c : input) { State* ns = RunStateOnByteUnlocked(s, c); - if (ns > SpecialStateMax && queued.find(ns) == queued.end()) { - queued.insert(ns); + if (ns == NULL) { + oom = true; + break; + } + if (ns == DeadState) { + output[ByteMap(c)] = -1; + continue; + } + if (m.find(ns) == m.end()) { + m.emplace(ns, static_cast(m.size())); q.push_back(ns); } + output[ByteMap(c)] = m[ns]; } + if (cb) + cb(oom ? NULL : output.data(), + s == FullMatchState || s->IsMatch()); + if (oom) + break; } - return q.size(); + return static_cast(m.size()); } // Build out all states in DFA for kind. Returns number of states. -int Prog::BuildEntireDFA(MatchKind kind) { - //LOG(ERROR) << "BuildEntireDFA is only for testing."; - return GetDFA(kind)->BuildAllStates(); +int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) { + return GetDFA(kind)->BuildAllStates(cb); +} + +void Prog::TEST_dfa_should_bail_when_slow(bool b) { + dfa_should_bail_when_slow = b; } // Computes min and max for matching string. @@ -1971,11 +2023,11 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Also note that previously_visited_states[UnseenStatePtr] will, in the STL // tradition, implicitly insert a '0' value at first use. We take advantage // of that property below. - map previously_visited_states; + std::unordered_map previously_visited_states; // Pick out start state for anchored search at beginning of text. RWLocker l(&cache_mutex_); - SearchParams params(NULL, NULL, &l); + SearchParams params(StringPiece(), StringPiece(), &l); params.anchored = true; if (!AnalyzeSearch(&params)) return false; @@ -2015,16 +2067,14 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Build minimum prefix.
State* s = params.start; min->clear(); + MutexLock lock(&mutex_); for (int i = 0; i < maxlen; i++) { - if (previously_visited_states[s] > kMaxEltRepetitions) { - VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions - << " for state s=" << s << " and min=" << CEscape(*min); + if (previously_visited_states[s] > kMaxEltRepetitions) break; - } previously_visited_states[s]++; // Stop if min is a match. - State* ns = RunStateOnByteUnlocked(s, kByteEndText); + State* ns = RunStateOnByte(s, kByteEndText); if (ns == NULL) // DFA out of memory return false; if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) @@ -2033,13 +2083,13 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // Try to extend the string with low bytes. bool extended = false; for (int j = 0; j < 256; j++) { - ns = RunStateOnByteUnlocked(s, j); + ns = RunStateOnByte(s, j); if (ns == NULL) // DFA out of memory return false; if (ns == FullMatchState || (ns > SpecialStateMax && ns->ninst_ > 0)) { extended = true; - min->append(1, j); + min->append(1, static_cast(j)); s = ns; break; } @@ -2053,23 +2103,20 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { s = params.start; max->clear(); for (int i = 0; i < maxlen; i++) { - if (previously_visited_states[s] > kMaxEltRepetitions) { - VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions - << " for state s=" << s << " and max=" << CEscape(*max); + if (previously_visited_states[s] > kMaxEltRepetitions) break; - } previously_visited_states[s] += 1; // Try to extend the string with high bytes. 
bool extended = false; for (int j = 255; j >= 0; j--) { - State* ns = RunStateOnByteUnlocked(s, j); + State* ns = RunStateOnByte(s, j); if (ns == NULL) return false; if (ns == FullMatchState || (ns > SpecialStateMax && ns->ninst_ > 0)) { extended = true; - max->append(1, j); + max->append(1, static_cast(j)); s = ns; break; } @@ -2081,7 +2128,7 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { } // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b - *max = PrefixSuccessor(*max); + PrefixSuccessor(max); // If there are no bytes left, we have no way to say "there is no maximum // string". We could make the interface more complicated and be able to @@ -2097,19 +2144,9 @@ bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { // PossibleMatchRange for a Prog. bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) { - DFA* dfa = NULL; - { - MutexLock l(&dfa_mutex_); - // Have to use dfa_longest_ to get all strings for full matches. - // For example, (a|aa) never matches aa in first-match mode. - dfa = dfa_longest_; - if (dfa == NULL) { - dfa = new DFA(this, Prog::kLongestMatch, dfa_mem_/2); - ATOMIC_STORE_RELEASE(&dfa_longest_, dfa); - delete_dfa_ = DeleteDFA; - } - } - return dfa->PossibleMatchRange(min, max, maxlen); + // Have to use dfa_longest_ to get all strings for full matches. + // For example, (a|aa) never matches aa in first-match mode. + return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen); } } // namespace re2 diff --git a/contrib/libre2/re2/filtered_re2.cc b/contrib/libre2/re2/filtered_re2.cc index dd2fd37312f..12f638a1f69 100644 --- a/contrib/libre2/re2/filtered_re2.cc +++ b/contrib/libre2/re2/filtered_re2.cc @@ -2,9 +2,13 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-#include -#include "util/util.h" #include "re2/filtered_re2.h" + +#include +#include + +#include "util/util.h" +#include "util/logging.h" #include "re2/prefilter.h" #include "re2/prefilter_tree.h" @@ -15,6 +19,11 @@ FilteredRE2::FilteredRE2() prefilter_tree_(new PrefilterTree()) { } +FilteredRE2::FilteredRE2(int min_atom_len) + : compiled_(false), + prefilter_tree_(new PrefilterTree(min_atom_len)) { +} + FilteredRE2::~FilteredRE2() { for (size_t i = 0; i < re2_vec_.size(); i++) delete re2_vec_[i]; @@ -33,16 +42,21 @@ RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern, } delete re; } else { - *id = re2_vec_.size(); + *id = static_cast(re2_vec_.size()); re2_vec_.push_back(re); } return code; } -void FilteredRE2::Compile(vector* atoms) { - if (compiled_ || re2_vec_.size() == 0) { - LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size(); +void FilteredRE2::Compile(std::vector* atoms) { + if (compiled_) { + LOG(ERROR) << "Compile called already."; + return; + } + + if (re2_vec_.empty()) { + LOG(ERROR) << "Compile called before Add."; return; } @@ -58,17 +72,17 @@ void FilteredRE2::Compile(vector* atoms) { int FilteredRE2::SlowFirstMatch(const StringPiece& text) const { for (size_t i = 0; i < re2_vec_.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[i])) - return i; + return static_cast(i); return -1; } int FilteredRE2::FirstMatch(const StringPiece& text, - const vector& atoms) const { + const std::vector& atoms) const { if (!compiled_) { - LOG(DFATAL) << "FirstMatch called before Compile"; + LOG(DFATAL) << "FirstMatch called before Compile."; return -1; } - vector regexps; + std::vector regexps; prefilter_tree_->RegexpsGivenStrings(atoms, &regexps); for (size_t i = 0; i < regexps.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) @@ -78,10 +92,10 @@ int FilteredRE2::FirstMatch(const StringPiece& text, bool FilteredRE2::AllMatches( const StringPiece& text, - const vector& atoms, - vector* matching_regexps) const { + const std::vector& atoms, + 
std::vector* matching_regexps) const { matching_regexps->clear(); - vector regexps; + std::vector regexps; prefilter_tree_->RegexpsGivenStrings(atoms, &regexps); for (size_t i = 0; i < regexps.size(); i++) if (RE2::PartialMatch(text, *re2_vec_[regexps[i]])) @@ -89,11 +103,16 @@ bool FilteredRE2::AllMatches( return !matching_regexps->empty(); } -void FilteredRE2::RegexpsGivenStrings(const vector& matched_atoms, - vector* passed_regexps) { - prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); +void FilteredRE2::AllPotentials( + const std::vector& atoms, + std::vector* potential_regexps) const { + prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps); } +void FilteredRE2::RegexpsGivenStrings(const std::vector& matched_atoms, + std::vector* passed_regexps) { + prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps); +} void FilteredRE2::PrintPrefilter(int regexpid) { prefilter_tree_->PrintPrefilter(regexpid); diff --git a/contrib/libre2/re2/filtered_re2.h b/contrib/libre2/re2/filtered_re2.h index 64b35be6c2c..b1317cccfa4 100644 --- a/contrib/libre2/re2/filtered_re2.h +++ b/contrib/libre2/re2/filtered_re2.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_FILTERED_RE2_H_ +#define RE2_FILTERED_RE2_H_ + // The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. // It provides a prefilter mechanism that helps in cutting down the // number of regexps that need to be actually searched. @@ -18,20 +21,19 @@ // indices of strings that were found in the text to get the actual // regexp matches. -#ifndef RE2_FILTERED_RE2_H_ -#define RE2_FILTERED_RE2_H_ - +#include #include + #include "re2/re2.h" namespace re2 { -using std::vector; class PrefilterTree; class FilteredRE2 { public: FilteredRE2(); + explicit FilteredRE2(int min_atom_len); ~FilteredRE2(); // Uses RE2 constructor to create a RE2 object (re). Returns
Returns @@ -47,7 +49,7 @@ class FilteredRE2 { // the search text should be lowercased first to find matching // strings from the set of strings returned by Compile. Call after // all Add calls are done. - void Compile(vector* strings_to_match); + void Compile(std::vector* strings_to_match); // Returns the index of the first matching regexp. // Returns -1 on no match. Can be called prior to Compile. @@ -59,16 +61,24 @@ class FilteredRE2 { // Returns -1 on no match. Compile has to be called before // calling this. int FirstMatch(const StringPiece& text, - const vector& atoms) const; + const std::vector& atoms) const; // Returns the indices of all matching regexps, after first clearing // matched_regexps. bool AllMatches(const StringPiece& text, - const vector& atoms, - vector* matching_regexps) const; + const std::vector& atoms, + std::vector* matching_regexps) const; + + // Returns the indices of all potentially matching regexps after first + // clearing potential_regexps. + // A regexp is potentially matching if it passes the filter. + // If a regexp passes the filter it may still not match. + // A regexp that does not pass the filter is guaranteed to not match. + void AllPotentials(const std::vector& atoms, + std::vector* potential_regexps) const; // The number of regexps added. - int NumRegexps() const { return re2_vec_.size(); } + int NumRegexps() const { return static_cast(re2_vec_.size()); } private: @@ -79,11 +89,11 @@ class FilteredRE2 { void PrintPrefilter(int regexpid); // Useful for testing and debugging. - void RegexpsGivenStrings(const vector& matched_atoms, - vector* passed_regexps); + void RegexpsGivenStrings(const std::vector& matched_atoms, + std::vector* passed_regexps); // All the regexps in the FilteredRE2. - vector re2_vec_; + std::vector re2_vec_; // Has the FilteredRE2 been compiled using Compile() bool compiled_; @@ -91,9 +101,8 @@ class FilteredRE2 { // An AND-OR tree of string atoms used for filtering regexps. 
PrefilterTree* prefilter_tree_; - //DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2); - FilteredRE2(const FilteredRE2&); - void operator=(const FilteredRE2&); + FilteredRE2(const FilteredRE2&) = delete; + FilteredRE2& operator=(const FilteredRE2&) = delete; }; } // namespace re2 diff --git a/contrib/libre2/re2/mimics_pcre.cc b/contrib/libre2/re2/mimics_pcre.cc index fc6dd4ad594..ad197bef554 100644 --- a/contrib/libre2/re2/mimics_pcre.cc +++ b/contrib/libre2/re2/mimics_pcre.cc @@ -23,6 +23,7 @@ // Regexp::MimicsPCRE checks for any of these conditions. #include "util/util.h" +#include "util/logging.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -124,7 +125,8 @@ class EmptyStringWalker : public Regexp::Walker { } private: - DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker); + EmptyStringWalker(const EmptyStringWalker&) = delete; + EmptyStringWalker& operator=(const EmptyStringWalker&) = delete; }; // Called after visiting re's children. child_args contains the return diff --git a/contrib/libre2/re2/nfa.cc b/contrib/libre2/re2/nfa.cc index 8c4f76136d5..ac853f9a22c 100644 --- a/contrib/libre2/re2/nfa.cc +++ b/contrib/libre2/re2/nfa.cc @@ -24,13 +24,24 @@ // Like Thompson's original machine and like the DFA implementation, this // implementation notices a match only once it is one byte past it. +#include +#include +#include +#include +#include +#include + #include "re2/prog.h" #include "re2/regexp.h" +#include "util/logging.h" #include "util/sparse_array.h" #include "util/sparse_set.h" +#include "util/strutil.h" namespace re2 { +static const bool ExtraDebug = false; + class NFA { public: NFA(Prog* prog); @@ -51,12 +62,10 @@ class NFA { bool anchored, bool longest, StringPiece* submatch, int nsubmatch); - static const int Debug = 0; - private: struct Thread { union { - int id; + int ref; Thread* next; // when on free list }; const char** capture; @@ -64,16 +73,15 @@ class NFA { // State for explicit stack in AddToThreadq. 
struct AddState { - int id; // Inst to process - int j; - const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip + int id; // Inst to process + Thread* t; // if not null, set t0 = t before processing id AddState() - : id(0), j(-1), cap_j(NULL) {} + : id(0), t(NULL) {} explicit AddState(int id) - : id(id), j(-1), cap_j(NULL) {} - AddState(int id, const char* cap_j, int j) - : id(id), j(j), cap_j(cap_j) {} + : id(id), t(NULL) {} + AddState(int id, Thread* t) + : id(id), t(t) {} }; // Threadq is a list of threads. The list is sorted by the order @@ -82,19 +90,24 @@ class NFA { typedef SparseArray Threadq; inline Thread* AllocThread(); - inline void FreeThread(Thread*); + inline Thread* Incref(Thread* t); + inline void Decref(Thread* t); - // Add id (or its children, following unlabeled arrows) - // to the workqueue q with associated capture info. - void AddToThreadq(Threadq* q, int id, int flag, - const char* p, const char** capture); + // Follows all empty arrows from id0 and enqueues all the states reached. + // Enqueues only the ByteRange instructions that match byte c. + // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. + // p is the current input position, and t0 is the current thread. + void AddToThreadq(Threadq* q, int id0, int c, int flag, + const char* p, Thread* t0); // Run runq on byte c, appending new states to nextq. // Updates matched_ and match_ as new, better matches are found. - // p is position of the next byte (the one after c) - // in the input string, used when processing capturing parens. - // flag is the bitwise or of Bol, Eol, etc., specifying whether - // ^, $ and \b match the current input point (after c). + // p is the position of byte c in the input string for AddToThreadq; + // p-1 will be used when processing Match instructions. + // flag is the bitwise OR of Bol, Eol, etc., specifying whether + // ^, $ and \b match the current input position (after c). + // Frees all the threads on runq. 
+ // If there is a shortcut to the end, returns that shortcut. inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p); // Returns text version of capture information, for debugging. @@ -102,10 +115,6 @@ class NFA { inline void CopyCapture(const char** dst, const char** src); - // Computes whether all matches must begin with the same first - // byte, and if so, returns that byte. If not, returns -1. - int ComputeFirstByte(); - Prog* prog_; // underlying program int start_; // start instruction in program int ncapture_; // number of submatches to track @@ -118,16 +127,16 @@ class NFA { bool matched_; // any match so far? AddState* astack_; // pre-allocated for AddToThreadq int nastack_; - int first_byte_; // required first byte for match, or -1 if none Thread* free_threads_; // free list - DISALLOW_EVIL_CONSTRUCTORS(NFA); + NFA(const NFA&) = delete; + NFA& operator=(const NFA&) = delete; }; NFA::NFA(Prog* prog) { prog_ = prog; - start_ = prog->start(); + start_ = prog_->start(); ncapture_ = 0; longest_ = false; endmatch_ = false; @@ -135,12 +144,14 @@ NFA::NFA(Prog* prog) { etext_ = NULL; q0_.resize(prog_->size()); q1_.resize(prog_->size()); - nastack_ = 2*prog_->size(); + // See NFA::AddToThreadq() for why this is so. 
+ nastack_ = 2*prog_->inst_count(kInstCapture) + + prog_->inst_count(kInstEmptyWidth) + + prog_->inst_count(kInstNop) + 1; // + 1 for start inst astack_ = new AddState[nastack_]; match_ = NULL; matched_ = false; free_threads_ = NULL; - first_byte_ = ComputeFirstByte(); } NFA::~NFA() { @@ -154,24 +165,36 @@ NFA::~NFA() { } } -void NFA::FreeThread(Thread *t) { - if (t == NULL) - return; - t->next = free_threads_; - free_threads_ = t; -} - NFA::Thread* NFA::AllocThread() { Thread* t = free_threads_; if (t == NULL) { t = new Thread; + t->ref = 1; t->capture = new const char*[ncapture_]; return t; } free_threads_ = t->next; + t->ref = 1; return t; } +NFA::Thread* NFA::Incref(Thread* t) { + DCHECK(t != NULL); + t->ref++; + return t; +} + +void NFA::Decref(Thread* t) { + if (t == NULL) + return; + t->ref--; + if (t->ref > 0) + return; + DCHECK_EQ(t->ref, 0); + t->next = free_threads_; + free_threads_ = t; +} + void NFA::CopyCapture(const char** dst, const char** src) { for (int i = 0; i < ncapture_; i+=2) { dst[i] = src[i]; @@ -180,35 +203,43 @@ void NFA::CopyCapture(const char** dst, const char** src) { } // Follows all empty arrows from id0 and enqueues all the states reached. +// Enqueues only the ByteRange instructions that match byte c. // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. -// The pointer p is the current input position, and m is the -// current set of match boundaries. -void NFA::AddToThreadq(Threadq* q, int id0, int flag, - const char* p, const char** capture) { +// p is the current input position, and t0 is the current thread. +void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag, + const char* p, Thread* t0) { if (id0 == 0) return; - // Astack_ is pre-allocated to avoid resize operations. - // It has room for 2*prog_->size() entries, which is enough: - // Each inst in prog can be processed at most once, - // pushing at most two entries on stk. 
- - int nstk = 0; + // Use astack_ to hold our stack of instructions yet to process. + // It was preallocated as follows: + // two entries per Capture; + // one entry per EmptyWidth; and + // one entry per Nop. + // This reflects the maximum number of stack pushes that each can + // perform. (Each instruction can be processed at most once.) AddState* stk = astack_; - stk[nstk++] = AddState(id0); + int nstk = 0; + stk[nstk++] = AddState(id0); while (nstk > 0) { DCHECK_LE(nstk, nastack_); - const AddState& a = stk[--nstk]; - if (a.j >= 0) - capture[a.j] = a.cap_j; + AddState a = stk[--nstk]; + + Loop: + if (a.t != NULL) { + // t0 was a thread that we allocated and copied in order to + // record the capture, so we must now decref it. + Decref(t0); + t0 = a.t; + } int id = a.id; if (id == 0) continue; if (q->has_index(id)) { - if (Debug) - fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str()); + if (ExtraDebug) + fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str()); continue; } @@ -231,62 +262,76 @@ void NFA::AddToThreadq(Threadq* q, int id0, int flag, case kInstAltMatch: // Save state; will pick up at next byte. - t = AllocThread(); - t->id = id; - CopyCapture(t->capture, capture); + t = Incref(t0); *tp = t; - // fall through - case kInstAlt: - // Explore alternatives. - stk[nstk++] = AddState(ip->out1()); - stk[nstk++] = AddState(ip->out()); - break; + DCHECK(!ip->last()); + a = AddState(id+1); + goto Loop; case kInstNop: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + // Continue on. - stk[nstk++] = AddState(ip->out()); - break; + a = AddState(ip->out()); + goto Loop; case kInstCapture: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + if ((j=ip->cap()) < ncapture_) { - // Push a dummy whose only job is to restore capture[j] + // Push a dummy whose only job is to restore t0 // once we finish exploring this possibility. - stk[nstk++] = AddState(0, capture[j], j); + stk[nstk++] = AddState(0, t0); // Record capture. 
- capture[j] = p; + t = AllocThread(); + CopyCapture(t->capture, t0->capture); + t->capture[j] = p; + t0 = t; } - stk[nstk++] = AddState(ip->out()); - break; + a = AddState(ip->out()); + goto Loop; + + case kInstByteRange: + if (!ip->Matches(c)) + goto Next; + FALLTHROUGH_INTENDED; case kInstMatch: - case kInstByteRange: // Save state; will pick up at next byte. - t = AllocThread(); - t->id = id; - CopyCapture(t->capture, capture); + t = Incref(t0); *tp = t; - if (Debug) - fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t); - break; + if (ExtraDebug) + fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str()); + + Next: + if (ip->last()) + break; + a = AddState(id+1); + goto Loop; case kInstEmptyWidth: + if (!ip->last()) + stk[nstk++] = AddState(id+1); + // Continue on if we have all the right flag bits. if (ip->empty() & ~flag) break; - stk[nstk++] = AddState(ip->out()); - break; + a = AddState(ip->out()); + goto Loop; } } } // Run runq on byte c, appending new states to nextq. -// Updates match as new, better matches are found. -// p is position of the byte c in the input string, -// used when processing capturing parens. -// flag is the bitwise or of Bol, Eol, etc., specifying whether -// ^, $ and \b match the current input point (after c). +// Updates matched_ and match_ as new, better matches are found. +// p is the position of byte c in the input string for AddToThreadq; +// p-1 will be used when processing Match instructions. +// flag is the bitwise OR of Bol, Eol, etc., specifying whether +// ^, $ and \b match the current input position (after c). // Frees all the threads on runq. // If there is a shortcut to the end, returns that shortcut. int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { @@ -300,12 +345,12 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { if (longest_) { // Can skip any threads started after our current best match. 
if (matched_ && match_[0] < t->capture[0]) { - FreeThread(t); + Decref(t); continue; } } - int id = t->id; + int id = i->index(); Prog::Inst* ip = prog_->inst(id); switch (ip->opcode()) { @@ -315,8 +360,7 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { break; case kInstByteRange: - if (ip->Matches(c)) - AddToThreadq(nextq, ip->out(), flag, p+1, t->capture); + AddToThreadq(nextq, ip->out(), c, flag, p, t); break; case kInstAltMatch: @@ -324,52 +368,58 @@ int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { break; // The match is ours if we want it. if (ip->greedy(prog_) || longest_) { - CopyCapture((const char**)match_, t->capture); - FreeThread(t); - for (++i; i != runq->end(); ++i) - FreeThread(i->second); - runq->clear(); + CopyCapture(match_, t->capture); matched_ = true; + + Decref(t); + for (++i; i != runq->end(); ++i) + Decref(i->second); + runq->clear(); if (ip->greedy(prog_)) return ip->out1(); return ip->out(); } break; - case kInstMatch: - if (endmatch_ && p != etext_) + case kInstMatch: { + // Avoid invoking undefined behavior when p happens + // to be null - and p-1 would be meaningless anyway. + if (p == NULL) + break; + + if (endmatch_ && p-1 != etext_) break; - const char* old = t->capture[1]; // previous end pointer - t->capture[1] = p; if (longest_) { // Leftmost-longest mode: save this match only if // it is either farther to the left or at the same // point but longer than an existing match. if (!matched_ || t->capture[0] < match_[0] || - (t->capture[0] == match_[0] && t->capture[1] > match_[1])) - CopyCapture((const char**)match_, t->capture); + (t->capture[0] == match_[0] && p-1 > match_[1])) { + CopyCapture(match_, t->capture); + match_[1] = p-1; + matched_ = true; + } } else { // Leftmost-biased mode: this match is by definition // better than what we've already found (see next line). 
- CopyCapture((const char**)match_, t->capture); + CopyCapture(match_, t->capture); + match_[1] = p-1; + matched_ = true; // Cut off the threads that can only find matches // worse than the one we just found: don't run the // rest of the current Threadq. - t->capture[0] = old; - FreeThread(t); + Decref(t); for (++i; i != runq->end(); ++i) - FreeThread(i->second); + Decref(i->second); runq->clear(); - matched_ = true; return 0; } - t->capture[0] = old; - matched_ = true; break; + } } - FreeThread(t); + Decref(t); } runq->clear(); return 0; @@ -391,12 +441,6 @@ string NFA::FormatCapture(const char** capture) { return s; } -// Returns whether haystack contains needle's memory. -static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) { - return haystack.begin() <= needle.begin() && - haystack.end() >= needle.end(); -} - bool NFA::Search(const StringPiece& text, const StringPiece& const_context, bool anchored, bool longest, StringPiece* submatch, int nsubmatch) { @@ -407,12 +451,9 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, if (context.begin() == NULL) context = text; - if (!StringPieceContains(context, text)) { - LOG(FATAL) << "Bad args: context does not contain text " - << reinterpret_cast(context.begin()) - << "+" << context.size() << " " - << reinterpret_cast(text.begin()) - << "+" << text.size(); + // Sanity check: make sure that text lies within context. + if (text.begin() < context.begin() || text.end() > context.end()) { + LOG(DFATAL) << "context does not contain text"; return false; } @@ -445,16 +486,14 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, match_ = new const char*[ncapture_]; matched_ = false; - memset(match_, 0, ncapture_*sizeof match_[0]); // For debugging prints. 
btext_ = context.begin(); - if (Debug) { + if (ExtraDebug) fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", - text.as_string().c_str(), context.as_string().c_str(), anchored, + text.ToString().c_str(), context.ToString().c_str(), anchored, longest); - } // Set up search. Threadq* runq = &q0_; @@ -462,14 +501,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, runq->clear(); nextq->clear(); memset(&match_[0], 0, ncapture_*sizeof match_[0]); - const char* bp = context.begin(); - int c = -1; int wasword = 0; - if (text.begin() > context.begin()) { - c = text.begin()[-1] & 0xFF; - wasword = Prog::IsWordChar(c); - } + if (text.begin() > context.begin()) + wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF); // Loop over the text, stepping the machine. for (const char* p = text.begin();; p++) { @@ -498,24 +533,29 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, else flag |= kEmptyNonWordBoundary; - if (Debug) { - fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword); + if (ExtraDebug) { + int c = 0; + if (p == context.begin()) + c = '^'; + else if (p > text.end()) + c = '$'; + else if (p < text.end()) + c = p[0] & 0xFF; + + fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword); for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { Thread* t = i->second; if (t == NULL) continue; - fprintf(stderr, " %d%s", t->id, - FormatCapture((const char**)t->capture).c_str()); + fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str()); } fprintf(stderr, "\n"); } - // Process previous character (waited until now to avoid - // repeating the flag computation above). - // This is a no-op the first time around the loop, because - // runq is empty. - int id = Step(runq, nextq, c, flag, p-1); + // This is a no-op the first time around the loop because runq is empty. + int id = Step(runq, nextq, p < text.end() ? 
p[0] & 0xFF : -1, flag, p); DCHECK_EQ(runq->size(), 0); + using std::swap; swap(nextq, runq); nextq->clear(); if (id != 0) { @@ -529,7 +569,8 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, break; case kInstCapture: - match_[ip->cap()] = p; + if (ip->cap() < ncapture_) + match_[ip->cap()] = p; id = ip->out(); continue; @@ -541,14 +582,6 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, match_[1] = p; matched_ = true; break; - - case kInstEmptyWidth: - if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { - LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty(); - break; - } - id = ip->out(); - continue; } break; } @@ -566,10 +599,10 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, // If there's a required first byte for an unanchored search // and we're not in the middle of any possible matches, // use memchr to search for the byte quickly. - if (!anchored && first_byte_ >= 0 && runq->size() == 0 && - p < text.end() && (p[0] & 0xFF) != first_byte_) { - p = reinterpret_cast(memchr(p, first_byte_, - text.end() - p)); + int fb = prog_->first_byte(); + if (!anchored && runq->size() == 0 && + fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) { + p = reinterpret_cast(memchr(p, fb, text.end() - p)); if (p == NULL) { p = text.end(); isword = 0; @@ -579,59 +612,48 @@ bool NFA::Search(const StringPiece& text, const StringPiece& const_context, flag = Prog::EmptyFlags(context, p); } - // Steal match storage (cleared but unused as of yet) - // temporarily to hold match boundaries for new thread. - match_[0] = p; - AddToThreadq(runq, start_, flag, p, match_); - match_[0] = NULL; + Thread* t = AllocThread(); + CopyCapture(t->capture, match_); + t->capture[0] = p; + AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t); + Decref(t); } // If all the threads have died, stop early. 
if (runq->size() == 0) { - if (Debug) + if (ExtraDebug) fprintf(stderr, "dead\n"); break; } - if (p == text.end()) - c = 0; - else - c = *p & 0xFF; wasword = isword; - - // Will run step(runq, nextq, c, ...) on next iteration. See above. } for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) - FreeThread(i->second); + Decref(i->second); if (matched_) { for (int i = 0; i < nsubmatch; i++) - submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); - if (Debug) - fprintf(stderr, "match (%d,%d)\n", - static_cast(match_[0] - btext_), - static_cast(match_[1] - btext_)); + submatch[i] = + StringPiece(match_[2 * i], + static_cast(match_[2 * i + 1] - match_[2 * i])); + if (ExtraDebug) + fprintf(stderr, "match (%td,%td)\n", + match_[0] - btext_, match_[1] - btext_); return true; } - VLOG(1) << "No matches found"; return false; } // Computes whether all successful matches have a common first byte, // and if so, returns that byte. If not, returns -1. -int NFA::ComputeFirstByte() { - if (start_ == 0) - return -1; - - int b = -1; // first byte, not yet computed - - typedef SparseSet Workq; - Workq q(prog_->size()); - q.insert(start_); - for (Workq::iterator it = q.begin(); it != q.end(); ++it) { +int Prog::ComputeFirstByte() { + int b = -1; + SparseSet q(size()); + q.insert(start()); + for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) { int id = *it; - Prog::Inst* ip = prog_->inst(id); + Prog::Inst* ip = inst(id); switch (ip->opcode()) { default: LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; @@ -642,6 +664,9 @@ int NFA::ComputeFirstByte() { return -1; case kInstByteRange: + if (!ip->last()) + q.insert(id+1); + // Must match only a single byte if (ip->lo() != ip->hi()) return -1; @@ -658,6 +683,9 @@ int NFA::ComputeFirstByte() { case kInstNop: case kInstCapture: case kInstEmptyWidth: + if (!ip->last()) + q.insert(id+1); + // Continue on. 
// Ignore ip->empty() flags for kInstEmptyWidth // in order to be as conservative as possible @@ -666,13 +694,9 @@ int NFA::ComputeFirstByte() { q.insert(ip->out()); break; - case kInstAlt: case kInstAltMatch: - // Explore alternatives. - if (ip->out()) - q.insert(ip->out()); - if (ip->out1()) - q.insert(ip->out1()); + DCHECK(!ip->last()); + q.insert(id+1); break; case kInstFail: @@ -686,7 +710,7 @@ bool Prog::SearchNFA(const StringPiece& text, const StringPiece& context, Anchor anchor, MatchKind kind, StringPiece* match, int nmatch) { - if (NFA::Debug) + if (ExtraDebug) Dump(); NFA nfa(this); @@ -705,5 +729,63 @@ Prog::SearchNFA(const StringPiece& text, const StringPiece& context, return true; } -} // namespace re2 +// For each instruction i in the program reachable from the start, compute the +// number of instructions reachable from i by following only empty transitions +// and record that count as fanout[i]. +// +// fanout holds the results and is also the work queue for the outer iteration. +// reachable holds the reached nodes for the inner iteration. 
+void Prog::Fanout(SparseArray* fanout) { + DCHECK_EQ(fanout->max_size(), size()); + SparseSet reachable(size()); + fanout->clear(); + fanout->set_new(start(), 0); + for (SparseArray::iterator i = fanout->begin(); i != fanout->end(); ++i) { + int* count = &i->second; + reachable.clear(); + reachable.insert(i->index()); + for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { + int id = *j; + Prog::Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()"; + break; + case kInstByteRange: + if (!ip->last()) + reachable.insert(id+1); + + (*count)++; + if (!fanout->has_index(ip->out())) { + fanout->set_new(ip->out(), 0); + } + break; + + case kInstAltMatch: + DCHECK(!ip->last()); + reachable.insert(id+1); + break; + + case kInstCapture: + case kInstEmptyWidth: + case kInstNop: + if (!ip->last()) + reachable.insert(id+1); + + reachable.insert(ip->out()); + break; + + case kInstMatch: + if (!ip->last()) + reachable.insert(id+1); + break; + + case kInstFail: + break; + } + } + } +} + +} // namespace re2 diff --git a/contrib/libre2/re2/onepass.cc b/contrib/libre2/re2/onepass.cc index 1c4998828b5..10dc6422e78 100644 --- a/contrib/libre2/re2/onepass.cc +++ b/contrib/libre2/re2/onepass.cc @@ -50,17 +50,29 @@ // See also Anne Brüggemann-Klein and Derick Wood, // "One-unambiguous regular languages", Information and Computation 142(2). +#include #include +#include #include +#include +#include + #include "util/util.h" -#include "util/arena.h" +#include "util/logging.h" #include "util/sparse_set.h" +#include "util/strutil.h" +#include "util/utf.h" #include "re2/prog.h" #include "re2/stringpiece.h" +// Silence "zero-sized array in struct/union" warning for OneState::action. 
+#ifdef _MSC_VER +#pragma warning(disable: 4200) +#endif + namespace re2 { -static const int Debug = 0; +static const bool ExtraDebug = false; // The key insight behind this implementation is that the // non-determinism in an NFA for a one-pass regular expression @@ -126,19 +138,16 @@ static const int Debug = 0; // whether a set of conditions required to finish a match at that // point in the input rather than process the next byte. -// A state in the one-pass NFA (aka DFA) - just an array of actions. -struct OneState; - // A state in the one-pass NFA - just an array of actions indexed // by the bytemap_[] of the next input byte. (The bytemap // maps next input bytes into equivalence classes, to reduce // the memory footprint.) struct OneState { - uint32 matchcond; // conditions to match right now. - uint32 action[1]; + uint32_t matchcond; // conditions to match right now. + uint32_t action[]; }; -// The uint32 conditions in the action are a combination of +// The uint32_t conditions in the action are a combination of // condition and capture bits and the next state. The bottom 16 bits // are the condition and capture bits, and the top 16 are the index of // the next state. @@ -155,8 +164,8 @@ struct OneState { // and kEmptyNonWordBoundary, so we can use that as a sentinel // instead of needing an extra bit. 
-static const int kIndexShift = 16; // number of bits below index -static const int kEmptyShift = 6; // number of empty flags in prog.h +static const int kIndexShift = 16; // number of bits below index +static const int kEmptyShift = 6; // number of empty flags in prog.h static const int kRealCapShift = kEmptyShift + 1; static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; @@ -164,23 +173,23 @@ static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; static const int kCapShift = kRealCapShift - 2; static const int kMaxCap = kRealMaxCap + 2; -static const uint32 kMatchWins = 1 << kEmptyShift; -static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; +static const uint32_t kMatchWins = 1 << kEmptyShift; +static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; -static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; +static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; // Check, at compile time, that prog.h agrees with math above. // This function is never called. void OnePass_Checks() { - COMPILE_ASSERT((1<( - const_cast(nodes + statesize*nodeindex)); + return reinterpret_cast(nodes + statesize*nodeindex); } bool Prog::SearchOnePass(const StringPiece& text, @@ -237,30 +243,27 @@ bool Prog::SearchOnePass(const StringPiece& text, if (anchor_end()) kind = kFullMatch; - // State and act are marked volatile to - // keep the compiler from re-ordering the - // memory accesses walking over the NFA. - // This is worth about 5%. - volatile OneState* state = onepass_start_; - volatile uint8* nodes = onepass_nodes_; - volatile uint32 statesize = onepass_statesize_; - uint8* bytemap = bytemap_; + uint8_t* nodes = onepass_nodes_; + int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); + // start() is always mapped to the zeroth OneState. 
+ OneState* state = IndexToNode(nodes, statesize, 0); + uint8_t* bytemap = bytemap_; const char* bp = text.begin(); const char* ep = text.end(); const char* p; bool matched = false; matchcap[0] = bp; cap[0] = bp; - uint32 nextmatchcond = state->matchcond; + uint32_t nextmatchcond = state->matchcond; for (p = bp; p < ep; p++) { int c = bytemap[*p & 0xFF]; - uint32 matchcond = nextmatchcond; - uint32 cond = state->action[c]; + uint32_t matchcond = nextmatchcond; + uint32_t cond = state->action[c]; // Determine whether we can reach act->next. // If so, advance state and nextmatchcond. if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { - uint32 nextindex = cond >> kIndexShift; + uint32_t nextindex = cond >> kIndexShift; state = IndexToNode(nodes, statesize, nextindex); nextmatchcond = state->matchcond; } else { @@ -319,7 +322,7 @@ bool Prog::SearchOnePass(const StringPiece& text, // Look for match at end of input. { - uint32 matchcond = state->matchcond; + uint32_t matchcond = state->matchcond; if (matchcond != kImpossible && ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { if (nmatch > 1 && (matchcond & kCapMask)) @@ -335,7 +338,9 @@ done: if (!matched) return false; for (int i = 0; i < nmatch; i++) - match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); + match[i] = + StringPiece(matchcap[2 * i], + static_cast(matchcap[2 * i + 1] - matchcap[2 * i])); return true; } @@ -357,7 +362,7 @@ static bool AddQ(Instq *q, int id) { struct InstCond { int id; - uint32 cond; + uint32_t cond; }; // Returns whether this is a one-pass program; that is, @@ -377,7 +382,7 @@ struct InstCond { // Constructs and saves corresponding one-pass NFA on success. bool Prog::IsOnePass() { if (did_onepass_) - return onepass_start_ != NULL; + return onepass_nodes_ != NULL; did_onepass_ = true; if (start() == 0) // no match @@ -387,32 +392,37 @@ bool Prog::IsOnePass() { // Willing to use at most 1/4 of the DFA budget (heuristic). 
// Limit max node count to 65000 as a conservative estimate to // avoid overflowing 16-bit node index in encoding. - int maxnodes = 2 + byte_inst_count_; - int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32); + int maxnodes = 2 + inst_count(kInstByteRange); + int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) return false; // Flood the graph starting at the start state, and check // that in each reachable state, each possible byte leads // to a unique next state. - int size = this->size(); - InstCond *stack = new InstCond[size]; + int stacksize = inst_count(kInstCapture) + + inst_count(kInstEmptyWidth) + + inst_count(kInstNop) + 1; // + 1 for start inst + InstCond* stack = new InstCond[stacksize]; + int size = this->size(); int* nodebyid = new int[size]; // indexed by ip memset(nodebyid, 0xFF, size*sizeof nodebyid[0]); - uint8* nodes = new uint8[maxnodes*statesize]; - uint8* nodep = nodes; + // Originally, nodes was a uint8_t[maxnodes*statesize], but that was + // unnecessarily optimistic: why allocate a large amount of memory + // upfront for a large program when it is unlikely to be one-pass? + std::vector nodes; Instq tovisit(size), workq(size); AddQ(&tovisit, start()); nodebyid[start()] = 0; - nodep += statesize; int nalloc = 1; + nodes.insert(nodes.end(), statesize, 0); for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { int id = *it; int nodeindex = nodebyid[id]; - OneState* node = IndexToNode(nodes, statesize, nodeindex); + OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); // Flood graph using manual stack, filling in actions as found. // Default is none. 
@@ -427,93 +437,108 @@ bool Prog::IsOnePass() { stack[nstack++].cond = 0; while (nstack > 0) { int id = stack[--nstack].id; + uint32_t cond = stack[nstack].cond; + + Loop: Prog::Inst* ip = inst(id); - uint32 cond = stack[nstack].cond; switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + case kInstAltMatch: // TODO(rsc): Ignoring kInstAltMatch optimization. // Should implement it in this engine, but it's subtle. - // Fall through. - case kInstAlt: + DCHECK(!ip->last()); // If already on work queue, (1) is violated: bail out. - if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1())) + if (!AddQ(&workq, id+1)) goto fail; - stack[nstack].id = ip->out1(); - stack[nstack++].cond = cond; - stack[nstack].id = ip->out(); - stack[nstack++].cond = cond; - break; + id = id+1; + goto Loop; case kInstByteRange: { int nextindex = nodebyid[ip->out()]; if (nextindex == -1) { if (nalloc >= maxnodes) { - if (Debug) - LOG(ERROR) - << StringPrintf("Not OnePass: hit node limit %d > %d", - nalloc, maxnodes); + if (ExtraDebug) + LOG(ERROR) << StringPrintf( + "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes); goto fail; } nextindex = nalloc; - nodep += statesize; - nodebyid[ip->out()] = nextindex; - nalloc++; AddQ(&tovisit, ip->out()); + nodebyid[ip->out()] = nalloc; + nalloc++; + nodes.insert(nodes.end(), statesize, 0); + // Update node because it might have been invalidated. + node = IndexToNode(nodes.data(), statesize, nodeindex); } - if (matched) - cond |= kMatchWins; for (int c = ip->lo(); c <= ip->hi(); c++) { int b = bytemap_[c]; - c = unbytemap_[b]; // last c in byte class - uint32 act = node->action[b]; - uint32 newact = (nextindex << kIndexShift) | cond; + // Skip any bytes immediately after c that are also in b. 
+ while (c < 256-1 && bytemap_[c+1] == b) + c++; + uint32_t act = node->action[b]; + uint32_t newact = (nextindex << kIndexShift) | cond; + if (matched) + newact |= kMatchWins; if ((act & kImpossible) == kImpossible) { node->action[b] = newact; } else if (act != newact) { - if (Debug) { - LOG(ERROR) - << StringPrintf("Not OnePass: conflict on byte " - "%#x at state %d", - c, *it); - } + if (ExtraDebug) + LOG(ERROR) << StringPrintf( + "Not OnePass: conflict on byte %#x at state %d", c, *it); goto fail; } } if (ip->foldcase()) { - Rune lo = max(ip->lo(), 'a') + 'A' - 'a'; - Rune hi = min(ip->hi(), 'z') + 'A' - 'a'; + Rune lo = std::max(ip->lo(), 'a') + 'A' - 'a'; + Rune hi = std::min(ip->hi(), 'z') + 'A' - 'a'; for (int c = lo; c <= hi; c++) { int b = bytemap_[c]; - c = unbytemap_[b]; // last c in class - uint32 act = node->action[b]; - uint32 newact = (nextindex << kIndexShift) | cond; + // Skip any bytes immediately after c that are also in b. + while (c < 256-1 && bytemap_[c+1] == b) + c++; + uint32_t act = node->action[b]; + uint32_t newact = (nextindex << kIndexShift) | cond; + if (matched) + newact |= kMatchWins; if ((act & kImpossible) == kImpossible) { node->action[b] = newact; } else if (act != newact) { - if (Debug) { - LOG(ERROR) - << StringPrintf("Not OnePass: conflict on byte " - "%#x at state %d", - c, *it); - } + if (ExtraDebug) + LOG(ERROR) << StringPrintf( + "Not OnePass: conflict on byte %#x at state %d", c, *it); goto fail; } } } - break; + + if (ip->last()) + break; + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, id+1)) + goto fail; + id = id+1; + goto Loop; } case kInstCapture: - if (ip->cap() < kMaxCap) - cond |= (1 << kCapShift) << ip->cap(); - goto QueueEmpty; - case kInstEmptyWidth: - cond |= ip->empty(); - goto QueueEmpty; - case kInstNop: - QueueEmpty: + if (!ip->last()) { + // If already on work queue, (1) is violated: bail out. 
+ if (!AddQ(&workq, id+1)) + goto fail; + stack[nstack].id = id+1; + stack[nstack++].cond = cond; + } + + if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap) + cond |= (1 << kCapShift) << ip->cap(); + if (ip->opcode() == kInstEmptyWidth) + cond |= ip->empty(); + // kInstCapture and kInstNop always proceed to ip->out(). // kInstEmptyWidth only sometimes proceeds to ip->out(), // but as a conservative approximation we assume it always does. @@ -522,29 +547,32 @@ bool Prog::IsOnePass() { // If already on work queue, (1) is violated: bail out. if (!AddQ(&workq, ip->out())) { - if (Debug) { - LOG(ERROR) << StringPrintf("Not OnePass: multiple paths" - " %d -> %d\n", - *it, ip->out()); - } + if (ExtraDebug) + LOG(ERROR) << StringPrintf( + "Not OnePass: multiple paths %d -> %d\n", *it, ip->out()); goto fail; } - stack[nstack].id = ip->out(); - stack[nstack++].cond = cond; - break; + id = ip->out(); + goto Loop; case kInstMatch: if (matched) { // (3) is violated - if (Debug) { - LOG(ERROR) << StringPrintf("Not OnePass: multiple matches" - " from %d\n", *it); - } + if (ExtraDebug) + LOG(ERROR) << StringPrintf( + "Not OnePass: multiple matches from %d\n", *it); goto fail; } matched = true; node->matchcond = cond; - break; + + if (ip->last()) + break; + // If already on work queue, (1) is violated: bail out. + if (!AddQ(&workq, id+1)) + goto fail; + id = id+1; + goto Loop; case kInstFail: break; @@ -552,29 +580,22 @@ bool Prog::IsOnePass() { } } - if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR). - string dump = "prog dump:\n" + Dump() + "node dump\n"; - map idmap; + if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR). 
+ LOG(ERROR) << "bytemap:\n" << DumpByteMap(); + LOG(ERROR) << "prog:\n" << Dump(); + + std::map idmap; for (int i = 0; i < size; i++) if (nodebyid[i] != -1) idmap[nodebyid[i]] = i; - StringAppendF(&dump, "byte ranges:\n"); - int i = 0; - for (int b = 0; b < bytemap_range_; b++) { - int lo = i; - while (bytemap_[i] == b) - i++; - StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1); - } - + string dump; for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { int id = *it; int nodeindex = nodebyid[id]; if (nodeindex == -1) - continue; - OneState* node = IndexToNode(nodes, statesize, nodeindex); - string s; + continue; + OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n", nodeindex, id, node->matchcond); for (int i = 0; i < bytemap_range_; i++) { @@ -586,19 +607,12 @@ bool Prog::IsOnePass() { idmap[node->action[i] >> kIndexShift]); } } - LOG(ERROR) << dump; + LOG(ERROR) << "nodes:\n" << dump; } - // Overallocated earlier; cut down to actual size. - nodep = new uint8[nalloc*statesize]; - memmove(nodep, nodes, nalloc*statesize); - delete[] nodes; - nodes = nodep; - - onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]); - onepass_nodes_ = nodes; - onepass_statesize_ = statesize; dfa_mem_ -= nalloc*statesize; + onepass_nodes_ = new uint8_t[nalloc*statesize]; + memmove(onepass_nodes_, nodes.data(), nalloc*statesize); delete[] stack; delete[] nodebyid; @@ -607,7 +621,6 @@ bool Prog::IsOnePass() { fail: delete[] stack; delete[] nodebyid; - delete[] nodes; return false; } diff --git a/contrib/libre2/re2/parse.cc b/contrib/libre2/re2/parse.cc index 0571a2f89b2..3374345c72b 100644 --- a/contrib/libre2/re2/parse.cc +++ b/contrib/libre2/re2/parse.cc @@ -16,14 +16,40 @@ // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. // See regexp.h for rationale. 
+#include +#include +#include +#include +#include +#include +#include +#include + #include "util/util.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "util/utf.h" #include "re2/regexp.h" #include "re2/stringpiece.h" #include "re2/unicode_casefold.h" #include "re2/unicode_groups.h" +#include "re2/walker-inl.h" + +#if defined(RE2_USE_ICU) +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/utypes.h" +#endif namespace re2 { +// Reduce the maximum repeat count by an order of magnitude when fuzzing. +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +static const int kMaxRepeat = 100; +#else +static const int kMaxRepeat = 1000; +#endif + // Regular expression parse state. // The list of parsed regexps so far is maintained as a vector of // Regexp pointers called the stack. Left parenthesis and vertical @@ -156,7 +182,8 @@ private: int ncap_; // number of capturing parens seen int rune_max_; // maximum char value for this encoding - DISALLOW_EVIL_CONSTRUCTORS(ParseState); + ParseState(const ParseState&) = delete; + ParseState& operator=(const ParseState&) = delete; }; // Pseudo-operators - only on parse stack. @@ -214,7 +241,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { // single characters (e.g., [.] instead of \.), and some // analysis does better with fewer character classes. // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. - if (re->op_ == kRegexpCharClass) { + if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { + re->ccb_->RemoveAbove(rune_max_); if (re->ccb_->size() == 1) { Rune r = re->ccb_->begin()->lo; re->Decref(); @@ -246,9 +274,9 @@ const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { // Binary search for entry containing r. 
while (n > 0) { int m = n/2; - if (static_cast(f[m].lo) <= r && r <= static_cast(f[m].hi)) + if (f[m].lo <= r && r <= f[m].hi) return &f[m]; - if (r < static_cast(f[m].lo)) { + if (r < f[m].lo) { n = m; } else { f += m+1; @@ -276,7 +304,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { case EvenOddSkip: // even <-> odd but only applies to every other if ((r - f->lo) % 2) return r; - // fall through + FALLTHROUGH_INTENDED; case EvenOdd: // even <-> odd if (r%2 == 0) return r + 1; @@ -285,7 +313,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { case OddEvenSkip: // odd <-> even but only applies to every other if ((r - f->lo) % 2) return r; - // fall through + FALLTHROUGH_INTENDED; case OddEven: // odd <-> even if (r%2 == 1) return r + 1; @@ -305,7 +333,7 @@ Rune ApplyFold(const CaseFold *f, Rune r) { // CycleFoldRune('?') = '?' Rune CycleFoldRune(Rune r) { const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); - if (f == NULL || r < static_cast(f->lo)) + if (f == NULL || r < f->lo) return r; return ApplyFold(f, r); } @@ -330,7 +358,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); if (f == NULL) // lo has no fold, nor does anything above lo break; - if (lo < static_cast(f->lo)) { // lo has no fold; next rune with a fold is f->lo + if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo lo = f->lo; continue; } @@ -338,7 +366,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { // Add in the result of folding the range lo - f->hi // and that range's fold, recursively. 
Rune lo1 = lo; - Rune hi1 = min(hi, f->hi); + Rune hi1 = std::min(hi, f->hi); switch (f->delta) { default: lo1 += f->delta; @@ -377,7 +405,6 @@ bool Regexp::ParseState::PushLiteral(Rune r) { } r = CycleFoldRune(r); } while (r != r1); - re->ccb_->RemoveAbove(rune_max_); return PushRegexp(re); } @@ -454,6 +481,23 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, Regexp::ParseFlags fl = flags_; if (nongreedy) fl = fl ^ NonGreedy; + + // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but + // they're mostly for use during simplification, not during parsing. + if (op == stacktop_->op() && fl == stacktop_->parse_flags()) + return true; + + // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because + // op is a repeat, we just have to check that stacktop_->op() is too, + // then adjust stacktop_. + if ((stacktop_->op() == kRegexpStar || + stacktop_->op() == kRegexpPlus || + stacktop_->op() == kRegexpQuest) && + fl == stacktop_->parse_flags()) { + stacktop_->op_ = kRegexpStar; + return true; + } + Regexp* re = new Regexp(op, fl); re->AllocSub(1); re->down_ = stacktop_->down_; @@ -463,12 +507,66 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, return true; } +// RepetitionWalker reports whether the repetition regexp is valid. +// Valid means that the combination of the top-level repetition +// and any inner repetitions does not exceed n copies of the +// innermost thing. +// This rewalks the regexp tree and is called for every repetition, +// so we have to worry about inducing quadratic behavior in the parser. +// We avoid this by only using RepetitionWalker when min or max >= 2. +// In that case the depth of any >= 2 nesting can only get to 9 without +// triggering a parse error, so each subtree can only be rewalked 9 times. 
+class RepetitionWalker : public Regexp::Walker { + public: + RepetitionWalker() {} + virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); + virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args); + virtual int ShortVisit(Regexp* re, int parent_arg); + + private: + RepetitionWalker(const RepetitionWalker&) = delete; + RepetitionWalker& operator=(const RepetitionWalker&) = delete; +}; + +int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { + int arg = parent_arg; + if (re->op() == kRegexpRepeat) { + int m = re->max(); + if (m < 0) { + m = re->min(); + } + if (m > 0) { + arg /= m; + } + } + return arg; +} + +int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, + int* child_args, int nchild_args) { + int arg = pre_arg; + for (int i = 0; i < nchild_args; i++) { + if (child_args[i] < arg) { + arg = child_args[i]; + } + } + return arg; +} + +int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { + // This should never be called, since we use Walk and not + // WalkExponential. + LOG(DFATAL) << "RepetitionWalker::ShortVisit called"; + return 0; +} + // Pushes a repetition regexp onto the stack. // A valid argument for the operator must already be on the stack. 
bool Regexp::ParseState::PushRepetition(int min, int max, const StringPiece& s, bool nongreedy) { - if ((max != -1 && max < min) || min > 1000 || max > 1000) { + if ((max != -1 && max < min) || min > kMaxRepeat || max > kMaxRepeat) { status_->set_code(kRegexpRepeatSize); status_->set_error_arg(s); return false; @@ -488,8 +586,15 @@ bool Regexp::ParseState::PushRepetition(int min, int max, re->down_ = stacktop_->down_; re->sub()[0] = FinishRegexp(stacktop_); re->simple_ = re->ComputeSimple(); - stacktop_ = re; + if (min >= 2 || max >= 2) { + RepetitionWalker w; + if (w.Walk(stacktop_, kMaxRepeat) == 0) { + status_->set_code(kRegexpRepeatSize); + status_->set_error_arg(s); + return false; + } + } return true; } @@ -504,7 +609,7 @@ bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { Regexp* re = new Regexp(kLeftParen, flags_); re->cap_ = ++ncap_; if (name.data() != NULL) - re->name_ = new string(name.as_string()); + re->name_ = new string(name.ToString()); return PushRegexp(re); } @@ -515,13 +620,6 @@ bool Regexp::ParseState::DoLeftParenNoCapture() { return PushRegexp(re); } -// Adds r to cc, along with r's upper case if foldascii is set. -static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { - cc->AddRange(r, r); - if (foldascii && 'a' <= r && r <= 'z') - cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); -} - // Processes a vertical bar in the input. bool Regexp::ParseState::DoVerticalBar() { MaybeConcatString(-1, NoParseFlags); @@ -535,46 +633,34 @@ bool Regexp::ParseState::DoVerticalBar() { Regexp* r1; Regexp* r2; if ((r1 = stacktop_) != NULL && - (r2 = stacktop_->down_) != NULL && + (r2 = r1->down_) != NULL && r2->op() == kVerticalBar) { - // If above and below vertical bar are literal or char class, - // can merge into a single char class. 
Regexp* r3; - if ((r1->op() == kRegexpLiteral || - r1->op() == kRegexpCharClass || - r1->op() == kRegexpAnyChar) && - (r3 = r2->down_) != NULL) { - Rune rune; - switch (r3->op()) { - case kRegexpLiteral: // convert to char class - rune = r3->rune_; - r3->op_ = kRegexpCharClass; - r3->cc_ = NULL; - r3->ccb_ = new CharClassBuilder; - AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); - // fall through - case kRegexpCharClass: - if (r1->op() == kRegexpLiteral) - AddLiteral(r3->ccb_, r1->rune_, - r1->parse_flags_ & Regexp::FoldCase); - else if (r1->op() == kRegexpCharClass) - r3->ccb_->AddCharClass(r1->ccb_); - if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { - delete r3->ccb_; - r3->ccb_ = NULL; - r3->op_ = kRegexpAnyChar; - } - // fall through - case kRegexpAnyChar: - // pop r1 - stacktop_ = r2; - r1->Decref(); - return true; - default: - break; + if ((r3 = r2->down_) != NULL && + (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) { + // AnyChar is above or below the vertical bar. Let it subsume + // the other when the other is Literal, CharClass or AnyChar. + if (r3->op() == kRegexpAnyChar && + (r1->op() == kRegexpLiteral || + r1->op() == kRegexpCharClass || + r1->op() == kRegexpAnyChar)) { + // Discard r1. + stacktop_ = r2; + r1->Decref(); + return true; + } + if (r1->op() == kRegexpAnyChar && + (r3->op() == kRegexpLiteral || + r3->op() == kRegexpCharClass || + r3->op() == kRegexpAnyChar)) { + // Rearrange the stack and discard r3. + r1->down_ = r3->down_; + r2->down_ = r1; + stacktop_ = r2; + r3->Decref(); + return true; } } - // Swap r1 below vertical bar (r2). 
r1->down_ = r2->down_; r2->down_ = r1; @@ -716,7 +802,7 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { Regexp* stk[4]; int d = 0; while (re->op() == kRegexpConcat) { - if (static_cast(d) < arraysize(stk)) + if (d < arraysize(stk)) stk[d++] = re; re = re->sub()[0]; } @@ -780,59 +866,180 @@ void Regexp::RemoveLeadingString(Regexp* re, int n) { } } +// In the context of factoring alternations, a Splice is: a factored prefix or +// merged character class computed by one iteration of one round of factoring; +// the span of subexpressions of the alternation to be "spliced" (i.e. removed +// and replaced); and, for a factored prefix, the number of suffixes after any +// factoring that might have subsequently been performed on them. For a merged +// character class, there are no suffixes, of course, so the field is ignored. +struct Splice { + Splice(Regexp* prefix, Regexp** sub, int nsub) + : prefix(prefix), + sub(sub), + nsub(nsub), + nsuffix(-1) {} + + Regexp* prefix; + Regexp** sub; + int nsub; + int nsuffix; +}; + +// Named so because it is used to implement an explicit stack, a Frame is: the +// span of subexpressions of the alternation to be factored; the current round +// of factoring; any Splices computed; and, for a factored prefix, an iterator +// to the next Splice to be factored (i.e. in another Frame) because suffixes. +struct Frame { + Frame(Regexp** sub, int nsub) + : sub(sub), + nsub(nsub), + round(0) {} + + Regexp** sub; + int nsub; + int round; + std::vector splices; + std::vector::iterator spliceiter; +}; + +// Bundled into a class for friend access to Regexp without needing to declare +// (or define) Splice in regexp.h. 
+class FactorAlternationImpl { + public: + static void Round1(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector* splices); + static void Round2(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector* splices); + static void Round3(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector* splices); +}; + // Factors common prefixes from alternation. // For example, // ABC|ABD|AEF|BCX|BCY // simplifies to // A(B(C|D)|EF)|BC(X|Y) -// which the normal parse state routines will further simplify to +// and thence to // A(B[CD]|EF)|BC[XY] // // Rewrites sub to contain simplified list to alternate and returns // the new length of sub. Adjusts reference counts accordingly // (incoming sub[i] decremented, outgoing sub[i] incremented). +int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) { + std::vector stk; + stk.emplace_back(sub, nsub); -// It's too much of a pain to write this code with an explicit stack, -// so instead we let the caller specify a maximum depth and -// don't simplify beyond that. There are around 15 words of local -// variables and parameters in the frame, so allowing 8 levels -// on a 64-bit machine is still less than a kilobyte of stack and -// probably enough benefit for practical uses. -const int kFactorAlternationMaxDepth = 8; + for (;;) { + auto& sub = stk.back().sub; + auto& nsub = stk.back().nsub; + auto& round = stk.back().round; + auto& splices = stk.back().splices; + auto& spliceiter = stk.back().spliceiter; -int Regexp::FactorAlternation( - Regexp** sub, int n, - Regexp::ParseFlags altflags) { - return FactorAlternationRecursive(sub, n, altflags, - kFactorAlternationMaxDepth); + if (splices.empty()) { + // Advance to the next round of factoring. Note that this covers + // the initialised state: when splices is empty and round is 0. + round++; + } else if (spliceiter != splices.end()) { + // We have at least one more Splice to factor. Recurse logically. 
+ stk.emplace_back(spliceiter->sub, spliceiter->nsub); + continue; + } else { + // We have no more Splices to factor. Apply them. + auto iter = splices.begin(); + int out = 0; + for (int i = 0; i < nsub; ) { + // Copy until we reach where the next Splice begins. + while (sub + i < iter->sub) + sub[out++] = sub[i++]; + switch (round) { + case 1: + case 2: { + // Assemble the Splice prefix and the suffixes. + Regexp* re[2]; + re[0] = iter->prefix; + re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags); + sub[out++] = Regexp::Concat(re, 2, flags); + i += iter->nsub; + break; + } + case 3: + // Just use the Splice prefix. + sub[out++] = iter->prefix; + i += iter->nsub; + break; + default: + LOG(DFATAL) << "unknown round: " << round; + break; + } + // If we are done, copy until the end of sub. + if (++iter == splices.end()) { + while (i < nsub) + sub[out++] = sub[i++]; + } + } + splices.clear(); + nsub = out; + // Advance to the next round of factoring. + round++; + } + + switch (round) { + case 1: + FactorAlternationImpl::Round1(sub, nsub, flags, &splices); + break; + case 2: + FactorAlternationImpl::Round2(sub, nsub, flags, &splices); + break; + case 3: + FactorAlternationImpl::Round3(sub, nsub, flags, &splices); + break; + case 4: + if (stk.size() == 1) { + // We are at the top of the stack. Just return. + return nsub; + } else { + // Pop the stack and set the number of suffixes. + // (Note that references will be invalidated!) + int nsuffix = nsub; + stk.pop_back(); + stk.back().spliceiter->nsuffix = nsuffix; + ++stk.back().spliceiter; + continue; + } + default: + LOG(DFATAL) << "unknown round: " << round; + break; + } + + // Set spliceiter depending on whether we have Splices to factor. 
+ if (splices.empty() || round == 3) { + spliceiter = splices.end(); + } else { + spliceiter = splices.begin(); + } + } } -int Regexp::FactorAlternationRecursive( - Regexp** sub, int n, - Regexp::ParseFlags altflags, - int maxdepth) { - - if (maxdepth <= 0) - return n; - +void FactorAlternationImpl::Round1(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector* splices) { // Round 1: Factor out common literal prefixes. - Rune *rune = NULL; + int start = 0; + Rune* rune = NULL; int nrune = 0; Regexp::ParseFlags runeflags = Regexp::NoParseFlags; - int start = 0; - int out = 0; - for (int i = 0; i <= n; i++) { - // Invariant: what was in sub[0:start] has been Decref'ed - // and that space has been reused for sub[0:out] (out <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin - // with the string rune[0:nrune]. - + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // begin with rune[0:nrune]. Rune* rune_i = NULL; int nrune_i = 0; Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; - if (i < n) { - rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); + if (i < nsub) { + rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i); if (runeflags_i == runeflags) { int same = 0; while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) @@ -846,109 +1053,121 @@ int Regexp::FactorAlternationRecursive( } // Found end of a run with common leading literal string: - // sub[start:i] all begin with rune[0:nrune] but sub[i] - // does not even begin with rune[0]. - // - // Factor out common string and append factored expression to sub[0:out]. + // sub[start:i] all begin with rune[0:nrune], + // but sub[i] does not even begin with rune[0]. if (i == start) { // Nothing to do - first iteration. } else if (i == start+1) { // Just one: don't bother factoring. - sub[out++] = sub[start]; } else { - // Construct factored form: prefix(suffix1|suffix2|...) 
- Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... - x[0] = LiteralString(rune, nrune, runeflags); + Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags); for (int j = start; j < i; j++) - RemoveLeadingString(sub[j], nrune); - int nn = FactorAlternationRecursive(sub + start, i - start, altflags, - maxdepth - 1); - x[1] = AlternateNoFactor(sub + start, nn, altflags); - sub[out++] = Concat(x, 2, altflags); + Regexp::RemoveLeadingString(sub[j], nrune); + splices->emplace_back(prefix, sub + start, i - start); } - // Prepare for next round (if there is one). - if (i < n) { + // Prepare for next iteration (if there is one). + if (i < nsub) { start = i; rune = rune_i; nrune = nrune_i; runeflags = runeflags_i; } } - n = out; +} - // Round 2: Factor out common complex prefixes, - // just the first piece of each concatenation, - // whatever it is. This is good enough a lot of the time. - start = 0; - out = 0; +void FactorAlternationImpl::Round2(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector* splices) { + // Round 2: Factor out common simple prefixes, + // just the first piece of each concatenation. + // This will be good enough a lot of the time. + // + // Complex subexpressions (e.g. involving quantifiers) + // are not safe to factor because that collapses their + // distinct paths through the automaton, which affects + // correctness in some cases. + int start = 0; Regexp* first = NULL; - for (int i = 0; i <= n; i++) { - // Invariant: what was in sub[0:start] has been Decref'ed - // and that space has been reused for sub[0:out] (out <= start). - // - // Invariant: sub[start:i] consists of regexps that all begin with first. - + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // begin with first. 
Regexp* first_i = NULL; - if (i < n) { - first_i = LeadingRegexp(sub[i]); - if (first != NULL && Regexp::Equal(first, first_i)) { + if (i < nsub) { + first_i = Regexp::LeadingRegexp(sub[i]); + if (first != NULL && + // first must be an empty-width op + // OR a char class, any char or any byte + // OR a fixed repeat of a literal, char class, any char or any byte. + (first->op() == kRegexpBeginLine || + first->op() == kRegexpEndLine || + first->op() == kRegexpWordBoundary || + first->op() == kRegexpNoWordBoundary || + first->op() == kRegexpBeginText || + first->op() == kRegexpEndText || + first->op() == kRegexpCharClass || + first->op() == kRegexpAnyChar || + first->op() == kRegexpAnyByte || + (first->op() == kRegexpRepeat && + first->min() == first->max() && + (first->sub()[0]->op() == kRegexpLiteral || + first->sub()[0]->op() == kRegexpCharClass || + first->sub()[0]->op() == kRegexpAnyChar || + first->sub()[0]->op() == kRegexpAnyByte))) && + Regexp::Equal(first, first_i)) continue; - } } // Found end of a run with common leading regexp: - // sub[start:i] all begin with first but sub[i] does not. - // - // Factor out common regexp and append factored expression to sub[0:out]. + // sub[start:i] all begin with first, + // but sub[i] does not. if (i == start) { // Nothing to do - first iteration. } else if (i == start+1) { // Just one: don't bother factoring. - sub[out++] = sub[start]; } else { - // Construct factored form: prefix(suffix1|suffix2|...) - Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 
- x[0] = first->Incref(); + Regexp* prefix = first->Incref(); for (int j = start; j < i; j++) - sub[j] = RemoveLeadingRegexp(sub[j]); - int nn = FactorAlternationRecursive(sub + start, i - start, altflags, - maxdepth - 1); - x[1] = AlternateNoFactor(sub + start, nn, altflags); - sub[out++] = Concat(x, 2, altflags); + sub[j] = Regexp::RemoveLeadingRegexp(sub[j]); + splices->emplace_back(prefix, sub + start, i - start); } - // Prepare for next round (if there is one). - if (i < n) { + // Prepare for next iteration (if there is one). + if (i < nsub) { start = i; first = first_i; } } - n = out; +} - // Round 3: Collapse runs of single literals into character classes. - start = 0; - out = 0; - for (int i = 0; i <= n; i++) { - // Invariant: what was in sub[0:start] has been Decref'ed - // and that space has been reused for sub[0:out] (out <= start). - // - // Invariant: sub[start:i] consists of regexps that are either - // literal runes or character classes. +void FactorAlternationImpl::Round3(Regexp** sub, int nsub, + Regexp::ParseFlags flags, + std::vector* splices) { + // Round 3: Merge runs of literals and/or character classes. + int start = 0; + Regexp* first = NULL; + for (int i = 0; i <= nsub; i++) { + // Invariant: sub[start:i] consists of regexps that all + // are either literals (i.e. runes) or character classes. + Regexp* first_i = NULL; + if (i < nsub) { + first_i = sub[i]; + if (first != NULL && + (first->op() == kRegexpLiteral || + first->op() == kRegexpCharClass) && + (first_i->op() == kRegexpLiteral || + first_i->op() == kRegexpCharClass)) + continue; + } - if (i < n && - (sub[i]->op() == kRegexpLiteral || - sub[i]->op() == kRegexpCharClass)) - continue; - - // sub[i] is not a char or char class; - // emit char class for sub[start:i]... + // Found end of a run of Literal/CharClass: + // sub[start:i] all are either one or the other, + // but sub[i] is not. if (i == start) { - // Nothing to do. + // Nothing to do - first iteration. 
} else if (i == start+1) { - sub[out++] = sub[start]; + // Just one: don't bother factoring. } else { - // Make new char class. CharClassBuilder ccb; for (int j = start; j < i; j++) { Regexp* re = sub[j]; @@ -964,31 +1183,16 @@ int Regexp::FactorAlternationRecursive( } re->Decref(); } - sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); + Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags); + splices->emplace_back(re, sub + start, i - start); } - // ... and then emit sub[i]. - if (i < n) - sub[out++] = sub[i]; - start = i+1; - } - n = out; - - // Round 4: Collapse runs of empty matches into single empty match. - start = 0; - out = 0; - for (int i = 0; i < n; i++) { - if (i + 1 < n && - sub[i]->op() == kRegexpEmptyMatch && - sub[i+1]->op() == kRegexpEmptyMatch) { - sub[i]->Decref(); - continue; + // Prepare for next iteration (if there is one). + if (i < nsub) { + start = i; + first = first_i; } - sub[out++] = sub[i]; } - n = out; - - return n; } // Collapse the regexps on top of the stack, down to the @@ -1105,7 +1309,7 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { if (r >= 0) { re1->op_ = kRegexpLiteral; re1->rune_ = r; - re1->parse_flags_ = flags; + re1->parse_flags_ = static_cast(flags); return true; } @@ -1116,9 +1320,8 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { // Lexing routines. -// Parses a decimal integer, storing it in *n. +// Parses a decimal integer, storing it in *np. // Sets *s to span the remainder of the string. -// Sets *out_re to the regexp for the class. static bool ParseInteger(StringPiece* s, int* np) { if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) return false; @@ -1185,9 +1388,18 @@ static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { // Argument order is backwards from usual Google style // but consistent with chartorune. 
static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { - int n; - if (fullrune(sp->data(), sp->size())) { - n = chartorune(r, sp->data()); + // fullrune() takes int, not size_t. However, it just looks + // at the leading byte and treats any length >= 4 the same. + if (fullrune(sp->data(), static_cast(std::min(size_t{4}, sp->size())))) { + int n = chartorune(r, sp->data()); + // Some copies of chartorune have a bug that accepts + // encodings of values in (10FFFF, 1FFFFF] as valid. + // Those values break the character class algorithm, + // which assumes Runemax is the largest rune. + if (*r > Runemax) { + n = 1; + *r = Runeerror; + } if (!(n == 1 && *r == Runeerror)) { // no decoding error sp->remove_prefix(n); return n; @@ -1195,7 +1407,7 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { } status->set_code(kRegexpBadUTF8); - status->set_error_arg(NULL); + status->set_error_arg(StringPiece()); return -1; } @@ -1239,12 +1451,12 @@ static bool ParseEscape(StringPiece* s, Rune* rp, if (s->size() < 1 || (*s)[0] != '\\') { // Should not happen - caller always checks. status->set_code(kRegexpInternalError); - status->set_error_arg(NULL); + status->set_error_arg(StringPiece()); return false; } if (s->size() < 2) { status->set_code(kRegexpTrailingBackslash); - status->set_error_arg(NULL); + status->set_error_arg(StringPiece()); return false; } Rune c, c1; @@ -1275,7 +1487,7 @@ static bool ParseEscape(StringPiece* s, Rune* rp, // Single non-zero octal digit is a backreference; not supported. if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') goto BadEscape; - // fall through + FALLTHROUGH_INTENDED; case '0': // consume up to three octal digits; already have one. code = c - '0'; @@ -1377,7 +1589,8 @@ static bool ParseEscape(StringPiece* s, Rune* rp, BadEscape: // Unrecognized escape sequence. 
status->set_code(kRegexpBadEscape); - status->set_error_arg(StringPiece(begin, s->data() - begin)); + status->set_error_arg( + StringPiece(begin, static_cast(s->begin() - begin))); return false; } @@ -1414,11 +1627,6 @@ static const UGroup* LookupGroup(const StringPiece& name, return NULL; } -// Fake UGroup containing all Runes -static URange16 any16[] = { { 0, 65535 } }; -static URange32 any32[] = { { 65536, Runemax } }; -static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; - // Look for a POSIX group with the given name (e.g., "[:^alpha:]") static const UGroup* LookupPosixGroup(const StringPiece& name) { return LookupGroup(name, posix_groups, num_posix_groups); @@ -1428,6 +1636,12 @@ static const UGroup* LookupPerlGroup(const StringPiece& name) { return LookupGroup(name, perl_groups, num_perl_groups); } +#if !defined(RE2_USE_ICU) +// Fake UGroup containing all Runes +static URange16 any16[] = { { 0, 65535 } }; +static URange32 any32[] = { { 65536, Runemax } }; +static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; + // Look for a Unicode group with the given name (e.g., "Han") static const UGroup* LookupUnicodeGroup(const StringPiece& name) { // Special case: "Any" means any. @@ -1435,6 +1649,7 @@ static const UGroup* LookupUnicodeGroup(const StringPiece& name) { return &anygroup; return LookupGroup(name, unicode_groups, num_unicode_groups); } +#endif // Add a UGroup or its negation to the character class. 
static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, @@ -1468,12 +1683,12 @@ static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, } int next = 0; for (int i = 0; i < g->nr16; i++) { - if (next < static_cast(g->r16[i].lo)) + if (next < g->r16[i].lo) cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); next = g->r16[i].hi + 1; } for (int i = 0; i < g->nr32; i++) { - if (next < static_cast(g->r32[i].lo)) + if (next < g->r32[i].lo) cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); next = g->r32[i].hi + 1; } @@ -1526,7 +1741,7 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, // Committed to parse. Results: int sign = +1; // -1 = negated char class if (c == 'P') - sign = -1; + sign = -sign; StringPiece seq = *s; // \p{Han} or \pL StringPiece name; // Han or L s->remove_prefix(2); // '\\', 'p' @@ -1536,11 +1751,11 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, if (c != '{') { // Name is the bit of string we just skipped over for c. const char* p = seq.begin() + 2; - name = StringPiece(p, s->begin() - p); + name = StringPiece(p, static_cast(s->begin() - p)); } else { // Name is in braces. Look for closing } - int end = s->find('}', 0); - if (end == static_cast(s->npos)) { + size_t end = s->find('}', 0); + if (end == StringPiece::npos) { if (!IsValidUTF8(seq, status)) return kParseError; status->set_code(kRegexpBadCharRange); @@ -1554,13 +1769,15 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, } // Chop seq where s now begins. - seq = StringPiece(seq.begin(), s->begin() - seq.begin()); + seq = StringPiece(seq.begin(), static_cast(s->begin() - seq.begin())); - // Look up group if (name.size() > 0 && name[0] == '^') { sign = -sign; name.remove_prefix(1); // '^' } + +#if !defined(RE2_USE_ICU) + // Look up the group in the RE2 Unicode data. 
const UGroup *g = LookupUnicodeGroup(name); if (g == NULL) { status->set_code(kRegexpBadCharRange); @@ -1569,6 +1786,31 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, } AddUGroup(cc, g, sign, parse_flags); +#else + // Look up the group in the ICU Unicode data. Because ICU provides full + // Unicode properties support, this could be more than a lookup by name. + ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8( + string("\\p{") + name.ToString() + string("}")); + UErrorCode uerr = U_ZERO_ERROR; + ::icu::UnicodeSet uset(ustr, uerr); + if (U_FAILURE(uerr)) { + status->set_code(kRegexpBadCharRange); + status->set_error_arg(seq); + return kParseError; + } + + // Convert the UnicodeSet to a URange32 and UGroup that we can add. + int nr = uset.getRangeCount(); + URange32* r = new URange32[nr]; + for (int i = 0; i < nr; i++) { + r[i].lo = uset.getRangeStart(i); + r[i].hi = uset.getRangeEnd(i); + } + UGroup g = {"", +1, 0, 0, r, nr}; + AddUGroup(cc, &g, sign, parse_flags); + delete[] r; +#endif + return kParseOk; } @@ -1595,7 +1837,7 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, // Got it. Check that it's valid. q += 2; - StringPiece name(p, q-p); + StringPiece name(p, static_cast(q - p)); const UGroup *g = LookupPosixGroup(name); if (g == NULL) { @@ -1649,7 +1891,8 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, return false; if (rr->hi < rr->lo) { status->set_code(kRegexpBadCharRange); - status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); + status->set_error_arg( + StringPiece(os.data(), static_cast(s->data() - os.data()))); return false; } } else { @@ -1668,7 +1911,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, if (s->size() == 0 || (*s)[0] != '[') { // Caller checked this. 
status->set_code(kRegexpInternalError); - status->set_error_arg(NULL); + status->set_error_arg(StringPiece()); return false; } bool negated = false; @@ -1763,7 +2006,6 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, if (negated) re->ccb_->Negate(); - re->ccb_->RemoveAbove(rune_max_); *out_re = re; return true; @@ -1776,7 +2018,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, static bool IsValidCaptureName(const StringPiece& name) { if (name.size() == 0) return false; - for (int i = 0; i < name.size(); i++) { + for (size_t i = 0; i < name.size(); i++) { int c = name[i]; if (('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || @@ -1822,8 +2064,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { // so that's the one we implement. One is enough. if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { // Pull out name. - int end = t.find('>', 2); - if (end == static_cast(t.npos)) { + size_t end = t.find('>', 2); + if (end == StringPiece::npos) { if (!IsValidUTF8(*s, status_)) return false; status_->set_code(kRegexpBadNamedCapture); @@ -1847,7 +2089,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { return false; } - s->remove_prefix(capture.end() - s->begin()); + s->remove_prefix(static_cast(capture.end() - s->begin())); return true; } @@ -1930,7 +2172,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { BadPerlOp: status_->set_code(kRegexpBadPerlOp); - status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); + status_->set_error_arg( + StringPiece(s->begin(), static_cast(t.begin() - s->begin()))); return false; } @@ -1942,7 +2185,7 @@ void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { char buf[UTFmax]; utf->clear(); - for (int i = 0; i < latin1.size(); i++) { + for (size_t i = 0; i < latin1.size(); i++) { Rune r = latin1[i] & 0xFF; int n = runetochar(buf, &r); utf->append(buf, n); @@ -1983,9 +2226,9 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, return 
ps.DoFinish(); } - StringPiece lastunary = NULL; + StringPiece lastunary = StringPiece(); while (t.size() > 0) { - StringPiece isunary = NULL; + StringPiece isunary = StringPiece(); switch (t[0]) { default: { Rune r; @@ -2008,7 +2251,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (!ps.DoLeftParenNoCapture()) return NULL; } else { - if (!ps.DoLeftParen(NULL)) + if (!ps.DoLeftParen(StringPiece())) return NULL; } t.remove_prefix(1); // '(' @@ -2077,12 +2320,14 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, // a** is a syntax error, not a double-star. // (and a++ means something else entirely, which we don't support!) status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece(lastunary.begin(), - t.begin() - lastunary.begin())); + status->set_error_arg(StringPiece( + lastunary.begin(), + static_cast(t.begin() - lastunary.begin()))); return NULL; } } - opstr.set(opstr.data(), t.data() - opstr.data()); + opstr = StringPiece(opstr.data(), + static_cast(t.data() - opstr.data())); if (!ps.PushRepeatOp(op, opstr, nongreedy)) return NULL; isunary = opstr; @@ -2108,12 +2353,14 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, if (lastunary.size() > 0) { // Not allowed to stack repetition operators. 
status->set_code(kRegexpRepeatOp); - status->set_error_arg(StringPiece(lastunary.begin(), - t.begin() - lastunary.begin())); + status->set_error_arg(StringPiece( + lastunary.begin(), + static_cast(t.begin() - lastunary.begin()))); return NULL; } } - opstr.set(opstr.data(), t.data() - opstr.data()); + opstr = StringPiece(opstr.data(), + static_cast(t.data() - opstr.data())); if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) return NULL; isunary = opstr; diff --git a/contrib/libre2/re2/prefilter.cc b/contrib/libre2/re2/prefilter.cc index 6ce43e1659d..e34aaf010f0 100644 --- a/contrib/libre2/re2/prefilter.cc +++ b/contrib/libre2/re2/prefilter.cc @@ -2,34 +2,38 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "util/util.h" #include "re2/prefilter.h" + +#include +#include +#include +#include + +#include "util/util.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "util/utf.h" #include "re2/re2.h" #include "re2/unicode_casefold.h" #include "re2/walker-inl.h" namespace re2 { -static const int Trace = false; +static const bool ExtraDebug = false; -typedef set::iterator SSIter; -typedef set::const_iterator ConstSSIter; +typedef std::set::iterator SSIter; +typedef std::set::const_iterator ConstSSIter; -static int alloc_id = 100000; // Used for debugging. // Initializes a Prefilter, allocating subs_ as necessary. Prefilter::Prefilter(Op op) { op_ = op; subs_ = NULL; if (op_ == AND || op_ == OR) - subs_ = new vector; - - alloc_id_ = alloc_id++; - VLOG(10) << "alloc_id: " << alloc_id_; + subs_ = new std::vector; } // Destroys a Prefilter. Prefilter::~Prefilter() { - VLOG(10) << "Deleted: " << alloc_id_; if (subs_) { for (size_t i = 0; i < subs_->size(); i++) delete (*subs_)[i]; @@ -45,7 +49,7 @@ Prefilter* Prefilter::Simplify() { } // Nothing left in the AND/OR. 
- if (subs_->size() == 0) { + if (subs_->empty()) { if (op_ == AND) op_ = ALL; // AND of nothing is true else @@ -136,7 +140,7 @@ Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) { return AndOr(OR, a, b); } -static void SimplifyStringSet(set *ss) { +static void SimplifyStringSet(std::set *ss) { // Now make sure that the strings aren't redundant. For example, if // we know "ab" is a required string, then it doesn't help at all to // know that "abc" is also a required string, so delete "abc". This @@ -157,7 +161,7 @@ static void SimplifyStringSet(set *ss) { } } -Prefilter* Prefilter::OrStrings(set* ss) { +Prefilter* Prefilter::OrStrings(std::set* ss) { SimplifyStringSet(ss); Prefilter* or_prefilter = NULL; if (!ss->empty()) { @@ -176,7 +180,7 @@ static Rune ToLowerRune(Rune r) { } const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r); - if (f == NULL || r < static_cast(f->lo)) + if (f == NULL || r < f->lo) return r; return ApplyFold(f, r); } @@ -222,14 +226,14 @@ class Prefilter::Info { // Caller takes ownership of the Prefilter. Prefilter* TakeMatch(); - set& exact() { return exact_; } + std::set& exact() { return exact_; } bool is_exact() const { return is_exact_; } class Walker; private: - set exact_; + std::set exact_; // When is_exact_ is true, the strings that match // are placed in exact_. When it is no longer an exact @@ -268,7 +272,9 @@ string Prefilter::Info::ToString() { if (is_exact_) { int n = 0; string s; - for (set::iterator i = exact_.begin(); i != exact_.end(); ++i) { + for (std::set::iterator i = exact_.begin(); + i != exact_.end(); + ++i) { if (n++ > 0) s += ","; s += *i; @@ -283,16 +289,17 @@ string Prefilter::Info::ToString() { } // Add the strings from src to dst. -static void CopyIn(const set& src, set* dst) { +static void CopyIn(const std::set& src, + std::set* dst) { for (ConstSSIter i = src.begin(); i != src.end(); ++i) dst->insert(*i); } // Add the cross-product of a and b to dst. 
// (For each string i in a and j in b, add i+j.) -static void CrossProduct(const set& a, - const set& b, - set* dst) { +static void CrossProduct(const std::set& a, + const std::set& b, + std::set* dst) { for (ConstSSIter i = a.begin(); i != a.end(); ++i) for (ConstSSIter j = b.begin(); j != b.end(); ++j) dst->insert(*i + *j); @@ -446,10 +453,10 @@ Prefilter::Info* Prefilter::Info::EmptyString() { typedef CharClass::iterator CCIter; Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, bool latin1) { - if (Trace) { - VLOG(0) << "CharClassInfo:"; + if (ExtraDebug) { + LOG(ERROR) << "CharClassInfo:"; for (CCIter i = cc->begin(); i != cc->end(); ++i) - VLOG(0) << " " << i->lo << "-" << i->hi; + LOG(ERROR) << " " << i->lo << "-" << i->hi; } // If the class is too large, it's okay to overestimate. @@ -469,9 +476,8 @@ Prefilter::Info* Prefilter::Info::CClass(CharClass *cc, a->is_exact_ = true; - if (Trace) { - VLOG(0) << " = " << a->ToString(); - } + if (ExtraDebug) + LOG(ERROR) << " = " << a->ToString(); return a; } @@ -492,15 +498,16 @@ class Prefilter::Info::Walker : public Regexp::Walker { bool latin1() { return latin1_; } private: bool latin1_; - DISALLOW_EVIL_CONSTRUCTORS(Walker); + + Walker(const Walker&) = delete; + Walker& operator=(const Walker&) = delete; }; Prefilter::Info* Prefilter::BuildInfo(Regexp* re) { - if (Trace) { - LOG(INFO) << "BuildPrefilter::Info: " << re->ToString(); - } + if (ExtraDebug) + LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString(); - bool latin1 = re->parse_flags() & Regexp::Latin1; + bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0; Prefilter::Info::Walker w(latin1); Prefilter::Info* info = w.WalkExponential(re, NULL, 100000); @@ -600,7 +607,6 @@ Prefilter::Info* Prefilter::Info::Walker::PostVisit( info = child_args[0]; for (int i = 1; i < nchild_args; i++) info = Alt(info, child_args[i]); - VLOG(10) << "Alt: " << info->ToString(); break; case kRegexpStar: @@ -630,10 +636,9 @@ Prefilter::Info* 
Prefilter::Info::Walker::PostVisit( break; } - if (Trace) { - VLOG(0) << "BuildInfo " << re->ToString() - << ": " << (info ? info->ToString() : ""); - } + if (ExtraDebug) + LOG(ERROR) << "BuildInfo " << re->ToString() + << ": " << (info ? info->ToString() : ""); return info; } diff --git a/contrib/libre2/re2/prefilter.h b/contrib/libre2/re2/prefilter.h index c2f9dddd856..ead09e104db 100644 --- a/contrib/libre2/re2/prefilter.h +++ b/contrib/libre2/re2/prefilter.h @@ -2,14 +2,19 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_PREFILTER_H_ +#define RE2_PREFILTER_H_ + // Prefilter is the class used to extract string guards from regexps. // Rather than using Prefilter class directly, use FilteredRE2. // See filtered_re2.h -#ifndef RE2_PREFILTER_H_ -#define RE2_PREFILTER_H_ +#include +#include +#include #include "util/util.h" +#include "util/logging.h" namespace re2 { @@ -37,14 +42,14 @@ class Prefilter { int unique_id() const { return unique_id_; } // The children of the Prefilter node. - vector* subs() { - CHECK(op_ == AND || op_ == OR); + std::vector* subs() { + DCHECK(op_ == AND || op_ == OR); return subs_; } // Set the children vector. Prefilter takes ownership of subs and // subs_ will be deleted when Prefilter is deleted. - void set_subs(vector* subs) { subs_ = subs; } + void set_subs(std::vector* subs) { subs_ = subs; } // Given a RE2, return a Prefilter. The caller takes ownership of // the Prefilter and should deallocate it. Returns NULL if Prefilter @@ -72,7 +77,7 @@ class Prefilter { static Prefilter* FromString(const string& str); - static Prefilter* OrStrings(set* ss); + static Prefilter* OrStrings(std::set* ss); static Info* BuildInfo(Regexp* re); @@ -82,7 +87,7 @@ class Prefilter { Op op_; // Sub-matches for AND or OR Prefilter. - vector* subs_; + std::vector* subs_; // Actual string to match in leaf node. 
string atom_; @@ -94,10 +99,8 @@ class Prefilter { // and -1 for duplicate nodes. int unique_id_; - // Used for debugging, helps in tracking memory leaks. - int alloc_id_; - - DISALLOW_EVIL_CONSTRUCTORS(Prefilter); + Prefilter(const Prefilter&) = delete; + Prefilter& operator=(const Prefilter&) = delete; }; } // namespace re2 diff --git a/contrib/libre2/re2/prefilter_tree.cc b/contrib/libre2/re2/prefilter_tree.cc index 0ce19a4330b..419f1544c85 100644 --- a/contrib/libre2/re2/prefilter_tree.cc +++ b/contrib/libre2/re2/prefilter_tree.cc @@ -2,20 +2,35 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#include "util/util.h" -#include "util/flags.h" -#include "re2/prefilter.h" #include "re2/prefilter_tree.h" -#include "re2/re2.h" -DEFINE_int32(filtered_re2_min_atom_len, - 3, - "Strings less than this length are not stored as atoms"); +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/util.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/prefilter.h" +#include "re2/re2.h" namespace re2 { +static const bool ExtraDebug = false; + PrefilterTree::PrefilterTree() - : compiled_(false) { + : compiled_(false), + min_atom_len_(3) { +} + +PrefilterTree::PrefilterTree(int min_atom_len) + : compiled_(false), + min_atom_len_(min_atom_len) { } PrefilterTree::~PrefilterTree() { @@ -26,62 +41,22 @@ PrefilterTree::~PrefilterTree() { delete entries_[i].parents; } -// Functions used for adding and Compiling prefilters to the -// PrefilterTree. 
-static bool KeepPart(Prefilter* prefilter, int level) { - if (prefilter == NULL) - return false; - - switch (prefilter->op()) { - default: - LOG(DFATAL) << "Unexpected op in KeepPart: " - << prefilter->op(); - return false; - - case Prefilter::ALL: - return false; - - case Prefilter::ATOM: - return prefilter->atom().size() >= - static_cast(FLAGS_filtered_re2_min_atom_len); - - case Prefilter::AND: { - int j = 0; - vector* subs = prefilter->subs(); - for (size_t i = 0; i < subs->size(); i++) - if (KeepPart((*subs)[i], level + 1)) - (*subs)[j++] = (*subs)[i]; - else - delete (*subs)[i]; - - subs->resize(j); - return j > 0; - } - - case Prefilter::OR: - for (size_t i = 0; i < prefilter->subs()->size(); i++) - if (!KeepPart((*prefilter->subs())[i], level + 1)) - return false; - return true; - } -} - -void PrefilterTree::Add(Prefilter *f) { +void PrefilterTree::Add(Prefilter* prefilter) { if (compiled_) { - LOG(DFATAL) << "Add after Compile."; + LOG(DFATAL) << "Add called after Compile."; return; } - if (f != NULL && !KeepPart(f, 0)) { - delete f; - f = NULL; + if (prefilter != NULL && !KeepNode(prefilter)) { + delete prefilter; + prefilter = NULL; } - prefilter_vec_.push_back(f); + prefilter_vec_.push_back(prefilter); } -void PrefilterTree::Compile(vector* atom_vec) { +void PrefilterTree::Compile(std::vector* atom_vec) { if (compiled_) { - LOG(DFATAL) << "Compile after Compile."; + LOG(DFATAL) << "Compile called already."; return; } @@ -93,7 +68,9 @@ void PrefilterTree::Compile(vector* atom_vec) { compiled_ = true; - AssignUniqueIds(atom_vec); + // TODO(junyer): Use std::unordered_set instead? + NodeMap nodes; + AssignUniqueIds(&nodes, atom_vec); // Identify nodes that are too common among prefilters and are // triggering too many parents. Then get rid of them if possible. @@ -109,9 +86,11 @@ void PrefilterTree::Compile(vector* atom_vec) { // this trigger. TODO(vsri): Adjust the threshold appropriately, // make it a function of total number of nodes? 
bool have_other_guard = true; - for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) + for (StdIntMap::iterator it = parents->begin(); + it != parents->end(); ++it) { have_other_guard = have_other_guard && (entries_[it->first].propagate_up_at_count > 1); + } if (have_other_guard) { for (StdIntMap::iterator it = parents->begin(); @@ -123,50 +102,82 @@ void PrefilterTree::Compile(vector* atom_vec) { } } - PrintDebugInfo(); + if (ExtraDebug) + PrintDebugInfo(&nodes); } -Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { +Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) { string node_string = NodeString(node); - map::iterator iter = node_map_.find(node_string); - if (iter == node_map_.end()) + std::map::iterator iter = nodes->find(node_string); + if (iter == nodes->end()) return NULL; return (*iter).second; } -static string Itoa(int n) { - char buf[100]; - snprintf(buf, sizeof buf, "%d", n); - return string(buf); -} - string PrefilterTree::NodeString(Prefilter* node) const { // Adding the operation disambiguates AND/OR/atom nodes. 
- string s = Itoa(node->op()) + ":"; + string s = StringPrintf("%d", node->op()) + ":"; if (node->op() == Prefilter::ATOM) { s += node->atom(); } else { - for (size_t i = 0; i < node->subs()->size() ; i++) { + for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) s += ','; - s += Itoa((*node->subs())[i]->unique_id()); + s += StringPrintf("%d", (*node->subs())[i]->unique_id()); } } return s; } -void PrefilterTree::AssignUniqueIds(vector* atom_vec) { +bool PrefilterTree::KeepNode(Prefilter* node) const { + if (node == NULL) + return false; + + switch (node->op()) { + default: + LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op(); + return false; + + case Prefilter::ALL: + return false; + + case Prefilter::ATOM: + return node->atom().size() >= static_cast(min_atom_len_); + + case Prefilter::AND: { + int j = 0; + std::vector* subs = node->subs(); + for (size_t i = 0; i < subs->size(); i++) + if (KeepNode((*subs)[i])) + (*subs)[j++] = (*subs)[i]; + else + delete (*subs)[i]; + + subs->resize(j); + return j > 0; + } + + case Prefilter::OR: + for (size_t i = 0; i < node->subs()->size(); i++) + if (!KeepNode((*node->subs())[i])) + return false; + return true; + } +} + +void PrefilterTree::AssignUniqueIds(NodeMap* nodes, + std::vector* atom_vec) { atom_vec->clear(); // Build vector of all filter nodes, sorted topologically // from top to bottom in v. - vector v; + std::vector v; // Add the top level nodes of each regexp prefilter. for (size_t i = 0; i < prefilter_vec_.size(); i++) { Prefilter* f = prefilter_vec_[i]; if (f == NULL) - unfiltered_.push_back(i); + unfiltered_.push_back(static_cast(i)); // We push NULL also on to v, so that we maintain the // mapping of index==regexpid for level=0 prefilter nodes. 
@@ -179,7 +190,7 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { if (f == NULL) continue; if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { - const vector& subs = *f->subs(); + const std::vector& subs = *f->subs(); for (size_t j = 0; j < subs.size(); j++) v.push_back(subs[j]); } @@ -187,16 +198,16 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { // Identify unique nodes. int unique_id = 0; - for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast(v.size()) - 1; i >= 0; i--) { Prefilter *node = v[i]; if (node == NULL) continue; node->set_unique_id(-1); - Prefilter* canonical = CanonicalNode(node); + Prefilter* canonical = CanonicalNode(nodes, node); if (canonical == NULL) { // Any further nodes that have the same node string // will find this node as the canonical node. - node_map_[NodeString(node)] = node; + nodes->emplace(NodeString(node), node); if (node->op() == Prefilter::ATOM) { atom_vec->push_back(node->atom()); atom_index_to_id_.push_back(unique_id); @@ -206,15 +217,15 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { node->set_unique_id(canonical->unique_id()); } } - entries_.resize(node_map_.size()); + entries_.resize(nodes->size()); - // Create parent IntMap for the entries. - for (int i = v.size() - 1; i >= 0; i--) { + // Create parent StdIntMap for the entries. + for (int i = static_cast(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; - if (CanonicalNode(prefilter) != prefilter) + if (CanonicalNode(nodes, prefilter) != prefilter) continue; Entry* entry = &entries_[prefilter->unique_id()]; @@ -222,12 +233,12 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { } // Fill the entries. 
- for (int i = v.size() - 1; i >= 0; i--) { + for (int i = static_cast(v.size()) - 1; i >= 0; i--) { Prefilter* prefilter = v[i]; if (prefilter == NULL) continue; - if (CanonicalNode(prefilter) != prefilter) + if (CanonicalNode(nodes, prefilter) != prefilter) continue; Entry* entry = &entries_[prefilter->unique_id()]; @@ -244,10 +255,10 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { case Prefilter::OR: case Prefilter::AND: { - set uniq_child; - for (size_t j = 0; j < prefilter->subs()->size() ; j++) { + std::set uniq_child; + for (size_t j = 0; j < prefilter->subs()->size(); j++) { Prefilter* child = (*prefilter->subs())[j]; - Prefilter* canonical = CanonicalNode(child); + Prefilter* canonical = CanonicalNode(nodes, child); if (canonical == NULL) { LOG(DFATAL) << "Null canonical node"; return; @@ -256,11 +267,14 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { uniq_child.insert(child_id); // To the child, we want to add to parent indices. Entry* child_entry = &entries_[child_id]; - if (child_entry->parents->find(prefilter->unique_id()) == child_entry->parents->end()) + if (child_entry->parents->find(prefilter->unique_id()) == + child_entry->parents->end()) { (*child_entry->parents)[prefilter->unique_id()] = 1; + } } - entry->propagate_up_at_count = - prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; + entry->propagate_up_at_count = prefilter->op() == Prefilter::AND + ? static_cast(uniq_child.size()) + : 1; break; } @@ -271,29 +285,28 @@ void PrefilterTree::AssignUniqueIds(vector* atom_vec) { for (size_t i = 0; i < prefilter_vec_.size(); i++) { if (prefilter_vec_[i] == NULL) continue; - int id = CanonicalNode(prefilter_vec_[i])->unique_id(); + int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id(); DCHECK_LE(0, id); Entry* entry = &entries_[id]; - entry->regexps.push_back(i); + entry->regexps.push_back(static_cast(i)); } } // Functions for triggering during search. 
void PrefilterTree::RegexpsGivenStrings( - const vector& matched_atoms, - vector* regexps) const { + const std::vector& matched_atoms, + std::vector* regexps) const { regexps->clear(); if (!compiled_) { - LOG(WARNING) << "Compile() not called"; + LOG(ERROR) << "RegexpsGivenStrings called before Compile."; for (size_t i = 0; i < prefilter_vec_.size(); ++i) - regexps->push_back(i); + regexps->push_back(static_cast(i)); } else { if (!prefilter_vec_.empty()) { - IntMap regexps_map(prefilter_vec_.size()); - vector matched_atom_ids; + IntMap regexps_map(static_cast(prefilter_vec_.size())); + std::vector matched_atom_ids; for (size_t j = 0; j < matched_atoms.size(); j++) { matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); - VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; } PropagateMatch(matched_atom_ids, ®exps_map); for (IntMap::iterator it = regexps_map.begin(); @@ -304,23 +317,20 @@ void PrefilterTree::RegexpsGivenStrings( regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); } } - sort(regexps->begin(), regexps->end()); + std::sort(regexps->begin(), regexps->end()); } -void PrefilterTree::PropagateMatch(const vector& atom_ids, +void PrefilterTree::PropagateMatch(const std::vector& atom_ids, IntMap* regexps) const { - IntMap count(entries_.size()); - IntMap work(entries_.size()); + IntMap count(static_cast(entries_.size())); + IntMap work(static_cast(entries_.size())); for (size_t i = 0; i < atom_ids.size(); i++) work.set(atom_ids[i], 1); for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { const Entry& entry = entries_[it->index()]; - VLOG(10) << "Processing: " << it->index(); // Record regexps triggered. - for (size_t i = 0; i < entry.regexps.size(); i++) { - VLOG(10) << "Regexp triggered: " << entry.regexps[i]; + for (size_t i = 0; i < entry.regexps.size(); i++) regexps->set(entry.regexps[i], 1); - } int c; // Pass trigger up to parents. 
for (StdIntMap::iterator it = entry.parents->begin(); @@ -328,7 +338,6 @@ void PrefilterTree::PropagateMatch(const vector& atom_ids, ++it) { int j = it->first; const Entry& parent = entries_[j]; - VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; // Delay until all the children have succeeded. if (parent.propagate_up_at_count > 1) { if (count.has_index(j)) { @@ -341,7 +350,6 @@ void PrefilterTree::PropagateMatch(const vector& atom_ids, if (c < parent.propagate_up_at_count) continue; } - VLOG(10) << "Triggering: " << j; // Trigger the parent. work.set(j, 1); } @@ -350,26 +358,26 @@ void PrefilterTree::PropagateMatch(const vector& atom_ids, // Debugging help. void PrefilterTree::PrintPrefilter(int regexpid) { - LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); + LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]); } -void PrefilterTree::PrintDebugInfo() { - VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); - VLOG(10) << "#Unique Nodes: " << entries_.size(); +void PrefilterTree::PrintDebugInfo(NodeMap* nodes) { + LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size(); + LOG(ERROR) << "#Unique Nodes: " << entries_.size(); for (size_t i = 0; i < entries_.size(); ++i) { StdIntMap* parents = entries_[i].parents; - const vector& regexps = entries_[i].regexps; - VLOG(10) << "EntryId: " << i - << " N: " << parents->size() << " R: " << regexps.size(); + const std::vector& regexps = entries_[i].regexps; + LOG(ERROR) << "EntryId: " << i + << " N: " << parents->size() << " R: " << regexps.size(); for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) - VLOG(10) << it->first; + LOG(ERROR) << it->first; } - VLOG(10) << "Map:"; - for (map::const_iterator iter = node_map_.begin(); - iter != node_map_.end(); ++iter) - VLOG(10) << "NodeId: " << (*iter).second->unique_id() - << " Str: " << (*iter).first; + LOG(ERROR) << "Map:"; + for (std::map::const_iterator iter = nodes->begin(); + iter != nodes->end(); ++iter) + 
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id() + << " Str: " << (*iter).first; } string PrefilterTree::DebugNodeString(Prefilter* node) const { @@ -382,10 +390,10 @@ string PrefilterTree::DebugNodeString(Prefilter* node) const { // Adding the operation disambiguates AND and OR nodes. node_string += node->op() == Prefilter::AND ? "AND" : "OR"; node_string += "("; - for (size_t i = 0; i < node->subs()->size() ; i++) { + for (size_t i = 0; i < node->subs()->size(); i++) { if (i > 0) node_string += ','; - node_string += Itoa((*node->subs())[i]->unique_id()); + node_string += StringPrintf("%d", (*node->subs())[i]->unique_id()); node_string += ":"; node_string += DebugNodeString((*node->subs())[i]); } diff --git a/contrib/libre2/re2/prefilter_tree.h b/contrib/libre2/re2/prefilter_tree.h index e1d3e5f9b33..f81e1346e0f 100644 --- a/contrib/libre2/re2/prefilter_tree.h +++ b/contrib/libre2/re2/prefilter_tree.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_PREFILTER_TREE_H_ +#define RE2_PREFILTER_TREE_H_ + // The PrefilterTree class is used to form an AND-OR tree of strings // that would trigger each regexp. The 'prefilter' of each regexp is // added tp PrefilterTree, and then PrefilterTree is used to find all @@ -12,23 +15,21 @@ // favorite engine. PrefilterTree provides a set of strings (called // atoms) that the user of this class should use to do the string // matching. -// -#ifndef RE2_PREFILTER_TREE_H_ -#define RE2_PREFILTER_TREE_H_ + +#include +#include +#include #include "util/util.h" #include "util/sparse_array.h" +#include "re2/prefilter.h" namespace re2 { -typedef SparseArray IntMap; -typedef map StdIntMap; - -class Prefilter; - class PrefilterTree { public: PrefilterTree(); + explicit PrefilterTree(int min_atom_len); ~PrefilterTree(); // Adds the prefilter for the next regexp. 
Note that we assume that @@ -42,20 +43,24 @@ class PrefilterTree { // The caller should use the returned set of strings to do string matching. // Each time a string matches, the corresponding index then has to be // and passed to RegexpsGivenStrings below. - void Compile(vector* atom_vec); + void Compile(std::vector* atom_vec); // Given the indices of the atoms that matched, returns the indexes // of regexps that should be searched. The matched_atoms should // contain all the ids of string atoms that were found to match the // content. The caller can use any string match engine to perform // this function. This function is thread safe. - void RegexpsGivenStrings(const vector& matched_atoms, - vector* regexps) const; + void RegexpsGivenStrings(const std::vector& matched_atoms, + std::vector* regexps) const; // Print debug prefilter. Also prints unique ids associated with // nodes of the prefilter of the regexp. void PrintPrefilter(int regexpid); + private: + typedef SparseArray IntMap; + typedef std::map StdIntMap; + typedef std::map NodeMap; // Each unique node has a corresponding Entry that helps in // passing the matching trigger information along the tree. @@ -76,22 +81,24 @@ class PrefilterTree { // When this node is ready to trigger the parent, what are the // regexps that are triggered. - vector regexps; + std::vector regexps; }; - private: + // Returns true if the prefilter node should be kept. + bool KeepNode(Prefilter* node) const; + // This function assigns unique ids to various parts of the // prefilter, by looking at if these nodes are already in the // PrefilterTree. - void AssignUniqueIds(vector* atom_vec); + void AssignUniqueIds(NodeMap* nodes, std::vector* atom_vec); // Given the matching atoms, find the regexps to be triggered. - void PropagateMatch(const vector& atom_ids, + void PropagateMatch(const std::vector& atom_ids, IntMap* regexps) const; // Returns the prefilter node that has the same NodeString as this // node. 
For the canonical node, returns node. - Prefilter* CanonicalNode(Prefilter* node); + Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node); // A string that uniquely identifies the node. Assumes that the // children of node has already been assigned unique ids. @@ -101,29 +108,30 @@ class PrefilterTree { string DebugNodeString(Prefilter* node) const; // Used for debugging. - void PrintDebugInfo(); + void PrintDebugInfo(NodeMap* nodes); // These are all the nodes formed by Compile. Essentially, there is // one node for each unique atom and each unique AND/OR node. - vector entries_; - - // Map node string to canonical Prefilter node. - map node_map_; + std::vector entries_; // indices of regexps that always pass through the filter (since we // found no required literals in these regexps). - vector unfiltered_; + std::vector unfiltered_; // vector of Prefilter for all regexps. - vector prefilter_vec_; + std::vector prefilter_vec_; // Atom index in returned strings to entry id mapping. - vector atom_index_to_id_; + std::vector atom_index_to_id_; // Has the prefilter tree been compiled. bool compiled_; - DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); + // Strings less than this length are not stored as atoms. + const int min_atom_len_; + + PrefilterTree(const PrefilterTree&) = delete; + PrefilterTree& operator=(const PrefilterTree&) = delete; }; } // namespace diff --git a/contrib/libre2/re2/prog.cc b/contrib/libre2/re2/prog.cc index f326ffdd12e..b0ae375970c 100644 --- a/contrib/libre2/re2/prog.cc +++ b/contrib/libre2/re2/prog.cc @@ -5,48 +5,57 @@ // Compiled regular expression representation. 
// Tested by compile_test.cc -#include "util/util.h" -#include "util/sparse_set.h" #include "re2/prog.h" + +#include +#include +#include +#include +#include + +#include "util/util.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "re2/bitmap256.h" #include "re2/stringpiece.h" namespace re2 { // Constructors per Inst opcode -void Prog::Inst::InitAlt(uint32 out, uint32 out1) { +void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) { DCHECK_EQ(out_opcode_, 0); set_out_opcode(out, kInstAlt); out1_ = out1; } -void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { +void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) { DCHECK_EQ(out_opcode_, 0); set_out_opcode(out, kInstByteRange); lo_ = lo & 0xFF; hi_ = hi & 0xFF; - foldcase_ = foldcase; + foldcase_ = foldcase & 0xFF; } -void Prog::Inst::InitCapture(int cap, uint32 out) { +void Prog::Inst::InitCapture(int cap, uint32_t out) { DCHECK_EQ(out_opcode_, 0); set_out_opcode(out, kInstCapture); cap_ = cap; } -void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) { +void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) { DCHECK_EQ(out_opcode_, 0); set_out_opcode(out, kInstEmptyWidth); empty_ = empty; } -void Prog::Inst::InitMatch(int32 id) { +void Prog::Inst::InitMatch(int32_t id) { DCHECK_EQ(out_opcode_, 0); set_opcode(kInstMatch); match_id_ = id; } -void Prog::Inst::InitNop(uint32 out) { +void Prog::Inst::InitNop(uint32_t out) { DCHECK_EQ(out_opcode_, 0); set_opcode(kInstNop); } @@ -94,34 +103,27 @@ Prog::Prog() : anchor_start_(false), anchor_end_(false), reversed_(false), + did_flatten_(false), did_onepass_(false), start_(0), start_unanchored_(0), size_(0), - byte_inst_count_(0), bytemap_range_(0), + first_byte_(-1), flags_(0), - onepass_statesize_(0), + list_count_(0), inst_(NULL), - dfa_first_(NULL), - dfa_longest_(NULL), - dfa_mem_(0), - delete_dfa_(NULL), - unbytemap_(NULL), onepass_nodes_(NULL), - onepass_start_(NULL) { + dfa_mem_(0), + 
dfa_first_(NULL), + dfa_longest_(NULL) { } Prog::~Prog() { - if (delete_dfa_) { - if (dfa_first_) - delete_dfa_(dfa_first_); - if (dfa_longest_) - delete_dfa_(dfa_longest_); - } + DeleteDFA(dfa_longest_); + DeleteDFA(dfa_first_); delete[] onepass_nodes_; delete[] inst_; - delete[] unbytemap_; } typedef SparseSet Workq; @@ -133,7 +135,6 @@ static inline void AddToQueue(Workq* q, int id) { static string ProgToString(Prog* prog, Workq* q) { string s; - for (Workq::iterator i = q->begin(); i != q->end(); ++i) { int id = *i; Prog::Inst* ip = prog->inst(id); @@ -145,29 +146,56 @@ static string ProgToString(Prog* prog, Workq* q) { return s; } -string Prog::Dump() { - string map; - if (false) { // Debugging - int lo = 0; - StringAppendF(&map, "byte map:\n"); - for (int i = 0; i < bytemap_range_; i++) { - StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]); - lo = unbytemap_[i] + 1; - } - StringAppendF(&map, "\n"); +static string FlattenedProgToString(Prog* prog, int start) { + string s; + for (int id = start; id < prog->size(); id++) { + Prog::Inst* ip = prog->inst(id); + if (ip->last()) + StringAppendF(&s, "%d. 
%s\n", id, ip->Dump().c_str()); + else + StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str()); } + return s; +} + +string Prog::Dump() { + if (did_flatten_) + return FlattenedProgToString(this, start_); Workq q(size_); AddToQueue(&q, start_); - return map + ProgToString(this, &q); + return ProgToString(this, &q); } string Prog::DumpUnanchored() { + if (did_flatten_) + return FlattenedProgToString(this, start_unanchored_); + Workq q(size_); AddToQueue(&q, start_unanchored_); return ProgToString(this, &q); } +string Prog::DumpByteMap() { + string map; + for (int c = 0; c < 256; c++) { + int b = bytemap_[c]; + int lo = c; + while (c < 256-1 && bytemap_[c+1] == b) + c++; + int hi = c; + StringAppendF(&map, "[%02x-%02x] -> %d\n", lo, hi, b); + } + return map; +} + +int Prog::first_byte() { + std::call_once(first_byte_once_, [](Prog* prog) { + prog->first_byte_ = prog->ComputeFirstByte(); + }, this); + return first_byte_; +} + static bool IsMatch(Prog*, Prog::Inst*); // Peep-hole optimizer. @@ -260,7 +288,7 @@ static bool IsMatch(Prog* prog, Prog::Inst* ip) { } } -uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { +uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { int flags = 0; // ^ and \A @@ -294,50 +322,505 @@ uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { return flags; } -void Prog::MarkByteRange(int lo, int hi) { +// ByteMapBuilder implements a coloring algorithm. +// +// The first phase is a series of "mark and merge" batches: we mark one or more +// [lo-hi] ranges, then merge them into our internal state. Batching is not for +// performance; rather, it means that the ranges are treated indistinguishably. +// +// Internally, the ranges are represented using a bitmap that stores the splits +// and a vector that stores the colors; both of them are indexed by the ranges' +// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at +// hi (if not already split), then recolor each range in between. 
The color map +// (i.e. from the old color to the new color) is maintained for the lifetime of +// the batch and so underpins this somewhat obscure approach to set operations. +// +// The second phase builds the bytemap from our internal state: we recolor each +// range, then store the new color (which is now the byte class) in each of the +// corresponding array elements. Finally, we output the number of byte classes. +class ByteMapBuilder { + public: + ByteMapBuilder() { + // Initial state: the [0-255] range has color 256. + // This will avoid problems during the second phase, + // in which we assign byte classes numbered from 0. + splits_.Set(255); + colors_.resize(256); + colors_[255] = 256; + nextcolor_ = 257; + } + + void Mark(int lo, int hi); + void Merge(); + void Build(uint8_t* bytemap, int* bytemap_range); + + private: + int Recolor(int oldcolor); + + Bitmap256 splits_; + std::vector colors_; + int nextcolor_; + std::vector> colormap_; + std::vector> ranges_; + + ByteMapBuilder(const ByteMapBuilder&) = delete; + ByteMapBuilder& operator=(const ByteMapBuilder&) = delete; +}; + +void ByteMapBuilder::Mark(int lo, int hi) { DCHECK_GE(lo, 0); DCHECK_GE(hi, 0); DCHECK_LE(lo, 255); DCHECK_LE(hi, 255); DCHECK_LE(lo, hi); - if (0 < lo && lo <= 255) - byterange_.Set(lo - 1); - if (0 <= hi && hi <= 255) - byterange_.Set(hi); + + // Ignore any [0-255] ranges. They cause us to recolor every range, which + // has no effect on the eventual result and is therefore a waste of time. 
+ if (lo == 0 && hi == 255) + return; + + ranges_.emplace_back(lo, hi); +} + +void ByteMapBuilder::Merge() { + for (std::vector>::const_iterator it = ranges_.begin(); + it != ranges_.end(); + ++it) { + int lo = it->first-1; + int hi = it->second; + + if (0 <= lo && !splits_.Test(lo)) { + splits_.Set(lo); + int next = splits_.FindNextSetBit(lo+1); + colors_[lo] = colors_[next]; + } + if (!splits_.Test(hi)) { + splits_.Set(hi); + int next = splits_.FindNextSetBit(hi+1); + colors_[hi] = colors_[next]; + } + + int c = lo+1; + while (c < 256) { + int next = splits_.FindNextSetBit(c); + colors_[next] = Recolor(colors_[next]); + if (next == hi) + break; + c = next+1; + } + } + colormap_.clear(); + ranges_.clear(); +} + +void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) { + // Assign byte classes numbered from 0. + nextcolor_ = 0; + + int c = 0; + while (c < 256) { + int next = splits_.FindNextSetBit(c); + uint8_t b = static_cast(Recolor(colors_[next])); + while (c <= next) { + bytemap[c] = b; + c++; + } + } + + *bytemap_range = nextcolor_; +} + +int ByteMapBuilder::Recolor(int oldcolor) { + // Yes, this is a linear search. There can be at most 256 + // colors and there will typically be far fewer than that. + // Also, we need to consider keys *and* values in order to + // avoid recoloring a given range more than once per batch. + std::vector>::const_iterator it = + std::find_if(colormap_.begin(), colormap_.end(), + [=](const std::pair& kv) -> bool { + return kv.first == oldcolor || kv.second == oldcolor; + }); + if (it != colormap_.end()) + return it->second; + int newcolor = nextcolor_; + nextcolor_++; + colormap_.emplace_back(oldcolor, newcolor); + return newcolor; } void Prog::ComputeByteMap() { - // Fill in bytemap with byte classes for prog_. - // Ranges of bytes that are treated as indistinguishable - // by the regexp program are mapped to a single byte class. - // The vector prog_->byterange() marks the end of each - // such range. 
- const Bitmap<256>& v = byterange(); + // Fill in bytemap with byte classes for the program. + // Ranges of bytes that are treated indistinguishably + // will be mapped to a single byte class. + ByteMapBuilder builder; - COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); - uint8 n = 0; - uint32 bits = 0; - for (int i = 0; i < 256; i++) { - if ((i&31) == 0) - bits = v.Word(i >> 5); - bytemap_[i] = n; - n += bits & 1; - bits >>= 1; - } - bytemap_range_ = bytemap_[255] + 1; - unbytemap_ = new uint8[bytemap_range_]; - for (int i = 0; i < 256; i++) - unbytemap_[bytemap_[i]] = i; + // Don't repeat the work for ^ and $. + bool marked_line_boundaries = false; + // Don't repeat the work for \b and \B. + bool marked_word_boundaries = false; - if (0) { // For debugging: use trivial byte map. - for (int i = 0; i < 256; i++) { - bytemap_[i] = i; - unbytemap_[i] = i; + for (int id = 0; id < size(); id++) { + Inst* ip = inst(id); + if (ip->opcode() == kInstByteRange) { + int lo = ip->lo(); + int hi = ip->hi(); + builder.Mark(lo, hi); + if (ip->foldcase() && lo <= 'z' && hi >= 'a') { + int foldlo = lo; + int foldhi = hi; + if (foldlo < 'a') + foldlo = 'a'; + if (foldhi > 'z') + foldhi = 'z'; + if (foldlo <= foldhi) + builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a'); + } + // If this Inst is not the last Inst in its list AND the next Inst is + // also a ByteRange AND the Insts have the same out, defer the merge. 
+ if (!ip->last() && + inst(id+1)->opcode() == kInstByteRange && + ip->out() == inst(id+1)->out()) + continue; + builder.Merge(); + } else if (ip->opcode() == kInstEmptyWidth) { + if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) && + !marked_line_boundaries) { + builder.Mark('\n', '\n'); + builder.Merge(); + marked_line_boundaries = true; + } + if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) && + !marked_word_boundaries) { + // We require two batches here: the first for ranges that are word + // characters, the second for ranges that are not word characters. + for (bool isword : {true, false}) { + int j; + for (int i = 0; i < 256; i = j) { + for (j = i + 1; j < 256 && + Prog::IsWordChar(static_cast(i)) == + Prog::IsWordChar(static_cast(j)); + j++) + ; + if (Prog::IsWordChar(static_cast(i)) == isword) + builder.Mark(i, j - 1); + } + builder.Merge(); + } + marked_word_boundaries = true; + } } + } + + builder.Build(bytemap_, &bytemap_range_); + + if (0) { // For debugging, use trivial bytemap. + LOG(ERROR) << "Using trivial bytemap."; + for (int i = 0; i < 256; i++) + bytemap_[i] = static_cast(i); bytemap_range_ = 256; - LOG(INFO) << "Using trivial bytemap."; + } +} + +// Prog::Flatten() implements a graph rewriting algorithm. +// +// The overall process is similar to epsilon removal, but retains some epsilon +// transitions: those from Capture and EmptyWidth instructions; and those from +// nullable subexpressions. (The latter avoids quadratic blowup in transitions +// in the worst case.) It might be best thought of as Alt instruction elision. +// +// In conceptual terms, it divides the Prog into "trees" of instructions, then +// traverses the "trees" in order to produce "lists" of instructions. A "tree" +// is one or more instructions that grow from one "root" instruction to one or +// more "leaf" instructions; if a "tree" has exactly one instruction, then the +// "root" is also the "leaf". 
In most cases, a "root" is the successor of some +// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction) +// and is considered a "successor root". A "leaf" can be a ByteRange, Capture, +// EmptyWidth or Match instruction. However, this is insufficient for handling +// nested nullable subexpressions correctly, so in some cases, a "root" is the +// dominator of the instructions reachable from some "successor root" (i.e. it +// has an unreachable predecessor) and is considered a "dominator root". Since +// only Alt instructions can be "dominator roots" (other instructions would be +// "leaves"), only Alt instructions are required to be marked as predecessors. +// +// Dividing the Prog into "trees" comprises two passes: marking the "successor +// roots" and the predecessors; and marking the "dominator roots". Sorting the +// "successor roots" by their bytecode offsets enables iteration in order from +// greatest to least during the second pass; by working backwards in this case +// and flooding the graph no further than "leaves" and already marked "roots", +// it becomes possible to mark "dominator roots" without doing excessive work. +// +// Traversing the "trees" is just iterating over the "roots" in order of their +// marking and flooding the graph no further than "leaves" and "roots". When a +// "leaf" is reached, the instruction is copied with its successor remapped to +// its "root" number. When a "root" is reached, a Nop instruction is generated +// with its successor remapped similarly. As each "list" is produced, its last +// instruction is marked as such. After all of the "lists" have been produced, +// a pass over their instructions remaps their successors to bytecode offsets. +void Prog::Flatten() { + if (did_flatten_) + return; + did_flatten_ = true; + + // Scratch structures. It's important that these are reused by functions + // that we call in loops because they would thrash the heap otherwise. 
+ SparseSet reachable(size()); + std::vector stk; + stk.reserve(size()); + + // First pass: Marks "successor roots" and predecessors. + // Builds the mapping from inst-ids to root-ids. + SparseArray rootmap(size()); + SparseArray predmap(size()); + std::vector> predvec; + MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk); + + // Second pass: Marks "dominator roots". + SparseArray sorted(rootmap); + std::sort(sorted.begin(), sorted.end(), sorted.less); + for (SparseArray::const_iterator i = sorted.end() - 1; + i != sorted.begin(); + --i) { + if (i->index() != start_unanchored() && i->index() != start()) + MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk); + } + + // Third pass: Emits "lists". Remaps outs to root-ids. + // Builds the mapping from root-ids to flat-ids. + std::vector flatmap(rootmap.size()); + std::vector flat; + flat.reserve(size()); + for (SparseArray::const_iterator i = rootmap.begin(); + i != rootmap.end(); + ++i) { + flatmap[i->value()] = static_cast(flat.size()); + EmitList(i->index(), &rootmap, &flat, &reachable, &stk); + flat.back().set_last(); + } + + list_count_ = static_cast(flatmap.size()); + for (int i = 0; i < kNumInst; i++) + inst_count_[i] = 0; + + // Fourth pass: Remaps outs to flat-ids. + // Counts instructions by opcode. + for (int id = 0; id < static_cast(flat.size()); id++) { + Inst* ip = &flat[id]; + if (ip->opcode() != kInstAltMatch) // handled in EmitList() + ip->set_out(flatmap[ip->out()]); + inst_count_[ip->opcode()]++; + } + + int total = 0; + for (int i = 0; i < kNumInst; i++) + total += inst_count_[i]; + DCHECK_EQ(total, static_cast(flat.size())); + + // Remap start_unanchored and start. 
+ if (start_unanchored() == 0) { + DCHECK_EQ(start(), 0); + } else if (start_unanchored() == start()) { + set_start_unanchored(flatmap[1]); + set_start(flatmap[1]); + } else { + set_start_unanchored(flatmap[1]); + set_start(flatmap[2]); + } + + // Finally, replace the old instructions with the new instructions. + size_ = static_cast(flat.size()); + delete[] inst_; + inst_ = new Inst[size_]; + memmove(inst_, flat.data(), size_ * sizeof *inst_); +} + +void Prog::MarkSuccessors(SparseArray* rootmap, + SparseArray* predmap, + std::vector>* predvec, + SparseSet* reachable, std::vector* stk) { + // Mark the kInstFail instruction. + rootmap->set_new(0, rootmap->size()); + + // Mark the start_unanchored and start instructions. + if (!rootmap->has_index(start_unanchored())) + rootmap->set_new(start_unanchored(), rootmap->size()); + if (!rootmap->has_index(start())) + rootmap->set_new(start(), rootmap->size()); + + reachable->clear(); + stk->clear(); + stk->push_back(start_unanchored()); + while (!stk->empty()) { + int id = stk->back(); + stk->pop_back(); + Loop: + if (reachable->contains(id)) + continue; + reachable->insert_new(id); + + Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + + case kInstAltMatch: + case kInstAlt: + // Mark this instruction as a predecessor of each out. + for (int out : {ip->out(), ip->out1()}) { + if (!predmap->has_index(out)) { + predmap->set_new(out, static_cast(predvec->size())); + predvec->emplace_back(); + } + (*predvec)[predmap->get_existing(out)].emplace_back(id); + } + stk->push_back(ip->out1()); + id = ip->out(); + goto Loop; + + case kInstByteRange: + case kInstCapture: + case kInstEmptyWidth: + // Mark the out of this instruction as a "root". 
+ if (!rootmap->has_index(ip->out())) + rootmap->set_new(ip->out(), rootmap->size()); + id = ip->out(); + goto Loop; + + case kInstNop: + id = ip->out(); + goto Loop; + + case kInstMatch: + case kInstFail: + break; + } + } +} + +void Prog::MarkDominator(int root, SparseArray* rootmap, + SparseArray* predmap, + std::vector>* predvec, + SparseSet* reachable, std::vector* stk) { + reachable->clear(); + stk->clear(); + stk->push_back(root); + while (!stk->empty()) { + int id = stk->back(); + stk->pop_back(); + Loop: + if (reachable->contains(id)) + continue; + reachable->insert_new(id); + + if (id != root && rootmap->has_index(id)) { + // We reached another "tree" via epsilon transition. + continue; + } + + Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + + case kInstAltMatch: + case kInstAlt: + stk->push_back(ip->out1()); + id = ip->out(); + goto Loop; + + case kInstByteRange: + case kInstCapture: + case kInstEmptyWidth: + break; + + case kInstNop: + id = ip->out(); + goto Loop; + + case kInstMatch: + case kInstFail: + break; + } + } + + for (SparseSet::const_iterator i = reachable->begin(); + i != reachable->end(); + ++i) { + int id = *i; + if (predmap->has_index(id)) { + for (int pred : (*predvec)[predmap->get_existing(id)]) { + if (!reachable->contains(pred)) { + // id has a predecessor that cannot be reached from root! + // Therefore, id must be a "root" too - mark it as such. 
+ if (!rootmap->has_index(id)) + rootmap->set_new(id, rootmap->size()); + } + } + } + } +} + +void Prog::EmitList(int root, SparseArray* rootmap, + std::vector* flat, + SparseSet* reachable, std::vector* stk) { + reachable->clear(); + stk->clear(); + stk->push_back(root); + while (!stk->empty()) { + int id = stk->back(); + stk->pop_back(); + Loop: + if (reachable->contains(id)) + continue; + reachable->insert_new(id); + + if (id != root && rootmap->has_index(id)) { + // We reached another "tree" via epsilon transition. Emit a kInstNop + // instruction so that the Prog does not become quadratically larger. + flat->emplace_back(); + flat->back().set_opcode(kInstNop); + flat->back().set_out(rootmap->get_existing(id)); + continue; + } + + Inst* ip = inst(id); + switch (ip->opcode()) { + default: + LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); + break; + + case kInstAltMatch: + flat->emplace_back(); + flat->back().set_opcode(kInstAltMatch); + flat->back().set_out(static_cast(flat->size())); + flat->back().out1_ = static_cast(flat->size())+1; + FALLTHROUGH_INTENDED; + + case kInstAlt: + stk->push_back(ip->out1()); + id = ip->out(); + goto Loop; + + case kInstByteRange: + case kInstCapture: + case kInstEmptyWidth: + flat->emplace_back(); + memmove(&flat->back(), ip, sizeof *ip); + flat->back().set_out(rootmap->get_existing(ip->out())); + break; + + case kInstNop: + id = ip->out(); + goto Loop; + + case kInstMatch: + case kInstFail: + flat->emplace_back(); + memmove(&flat->back(), ip, sizeof *ip); + break; + } } } } // namespace re2 - diff --git a/contrib/libre2/re2/prog.h b/contrib/libre2/re2/prog.h index 2cf65bc7672..3fb1c1fb0aa 100644 --- a/contrib/libre2/re2/prog.h +++ b/contrib/libre2/re2/prog.h @@ -2,50 +2,27 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_PROG_H_ +#define RE2_PROG_H_ + // Compiled representation of regular expressions. 
// See regexp.h for the Regexp class, which represents a regular // expression symbolically. -#ifndef RE2_PROG_H__ -#define RE2_PROG_H__ +#include +#include +#include +#include +#include #include "util/util.h" +#include "util/logging.h" +#include "util/sparse_array.h" +#include "util/sparse_set.h" #include "re2/re2.h" namespace re2 { -// Simple fixed-size bitmap. -template -class Bitmap { - public: - Bitmap() { Reset(); } - int Size() { return Bits; } - - void Reset() { - for (int i = 0; i < Words; i++) - w_[i] = 0; - } - bool Get(int k) const { - return w_[k >> WordLog] & (1<<(k & 31)); - } - void Set(int k) { - w_[k >> WordLog] |= 1<<(k & 31); - } - void Clear(int k) { - w_[k >> WordLog] &= ~(1<<(k & 31)); - } - uint32 Word(int i) const { - return w_[i]; - } - - private: - static const int WordLog = 5; - static const int Words = (Bits+31)/32; - uint32 w_[Words]; - DISALLOW_EVIL_CONSTRUCTORS(Bitmap); -}; - - // Opcodes for Inst enum InstOp { kInstAlt = 0, // choose between out_ and out1_ @@ -56,6 +33,7 @@ enum InstOp { kInstMatch, // found a match! kInstNop, // no-op; occasionally unavoidable kInstFail, // never match; occasionally unavoidable + kNumInst, }; // Bit flags for empty-width specials @@ -69,10 +47,8 @@ enum EmptyOp { kEmptyAllFlags = (1<<6)-1, }; -class Regexp; - class DFA; -struct OneState; +class Regexp; // Compiled form of regexp program. class Prog { @@ -83,31 +59,39 @@ class Prog { // Single instruction in regexp program. class Inst { public: - Inst() : out_opcode_(0), out1_(0) { } + Inst() : out_opcode_(0), out1_(0) {} + + // Copyable. 
+ Inst(const Inst&) = default; + Inst& operator=(const Inst&) = default; // Constructors per opcode - void InitAlt(uint32 out, uint32 out1); - void InitByteRange(int lo, int hi, int foldcase, uint32 out); - void InitCapture(int cap, uint32 out); - void InitEmptyWidth(EmptyOp empty, uint32 out); + void InitAlt(uint32_t out, uint32_t out1); + void InitByteRange(int lo, int hi, int foldcase, uint32_t out); + void InitCapture(int cap, uint32_t out); + void InitEmptyWidth(EmptyOp empty, uint32_t out); void InitMatch(int id); - void InitNop(uint32 out); + void InitNop(uint32_t out); void InitFail(); // Getters - int id(Prog* p) { return this - p->inst_; } + int id(Prog* p) { return static_cast(this - p->inst_); } InstOp opcode() { return static_cast(out_opcode_&7); } - int out() { return out_opcode_>>3; } - int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } + int last() { return (out_opcode_>>3)&1; } + int out() { return out_opcode_>>4; } + int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } - bool greedy(Prog *p) { + + bool greedy(Prog* p) { DCHECK_EQ(opcode(), kInstAltMatch); - return p->inst(out())->opcode() == kInstByteRange; + return p->inst(out())->opcode() == kInstByteRange || + (p->inst(out())->opcode() == kInstNop && + p->inst(p->inst(out())->out())->opcode() == kInstByteRange); } // Does this inst (an kInstByteRange) match c? @@ -122,52 +106,54 @@ class Prog { string Dump(); // Maximum instruction id. - // (Must fit in out_opcode_, and PatchList steals another bit.) 
+ // (Must fit in out_opcode_. PatchList/last steal another bit.) static const int kMaxInst = (1<<28) - 1; private: void set_opcode(InstOp opcode) { - out_opcode_ = (out()<<3) | opcode; + out_opcode_ = (out()<<4) | (last()<<3) | opcode; + } + + void set_last() { + out_opcode_ = (out()<<4) | (1<<3) | opcode(); } void set_out(int out) { - out_opcode_ = (out<<3) | opcode(); + out_opcode_ = (out<<4) | (last()<<3) | opcode(); } void set_out_opcode(int out, InstOp opcode) { - out_opcode_ = (out<<3) | opcode; + out_opcode_ = (out<<4) | (last()<<3) | opcode; } - uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode - union { // additional instruction arguments: - uint32 out1_; // opcode == kInstAlt - // alternate next instruction + uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode + union { // additional instruction arguments: + uint32_t out1_; // opcode == kInstAlt + // alternate next instruction - int32 cap_; // opcode == kInstCapture - // Index of capture register (holds text - // position recorded by capturing parentheses). - // For \n (the submatch for the nth parentheses), - // the left parenthesis captures into register 2*n - // and the right one captures into register 2*n+1. + int32_t cap_; // opcode == kInstCapture + // Index of capture register (holds text + // position recorded by capturing parentheses). + // For \n (the submatch for the nth parentheses), + // the left parenthesis captures into register 2*n + // and the right one captures into register 2*n+1. - int32 match_id_; // opcode == kInstMatch - // Match ID to identify this match (for re2::Set). + int32_t match_id_; // opcode == kInstMatch + // Match ID to identify this match (for re2::Set). - struct { // opcode == kInstByteRange - uint8 lo_; // byte range is lo_-hi_ inclusive - uint8 hi_; // - uint8 foldcase_; // convert A-Z to a-z before checking range. 
+ struct { // opcode == kInstByteRange + uint8_t lo_; // byte range is lo_-hi_ inclusive + uint8_t hi_; // + uint8_t foldcase_; // convert A-Z to a-z before checking range. }; - EmptyOp empty_; // opcode == kInstEmptyWidth - // empty_ is bitwise OR of kEmpty* flags above. + EmptyOp empty_; // opcode == kInstEmptyWidth + // empty_ is bitwise OR of kEmpty* flags above. }; friend class Compiler; friend struct PatchList; friend class Prog; - - DISALLOW_EVIL_CONSTRUCTORS(Inst); }; // Whether to anchor the search. @@ -200,13 +186,13 @@ class Prog { int start_unanchored() { return start_unanchored_; } void set_start(int start) { start_ = start; } void set_start_unanchored(int start) { start_unanchored_ = start; } - int64 size() { return size_; } + int size() { return size_; } bool reversed() { return reversed_; } void set_reversed(bool reversed) { reversed_ = reversed; } - int64 byte_inst_count() { return byte_inst_count_; } - const Bitmap<256>& byterange() { return byterange_; } - void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } - int64 dfa_mem() { return dfa_mem_; } + int list_count() { return list_count_; } + int inst_count(InstOp op) { return inst_count_[op]; } + void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } + int64_t dfa_mem() { return dfa_mem_; } int flags() { return flags_; } void set_flags(int flags) { flags_ = flags; } bool anchor_start() { return anchor_start_; } @@ -214,21 +200,19 @@ class Prog { bool anchor_end() { return anchor_end_; } void set_anchor_end(bool b) { anchor_end_ = b; } int bytemap_range() { return bytemap_range_; } - const uint8* bytemap() { return bytemap_; } + const uint8_t* bytemap() { return bytemap_; } + + // Lazily computed. + int first_byte(); // Returns string representation of program for debugging. string Dump(); string DumpUnanchored(); - - // Record that at some point in the prog, the bytes in the range - // lo-hi (inclusive) are treated as different from bytes outside the range. 
- // Tracking this lets the DFA collapse commonly-treated byte ranges - // when recording state pointers, greatly reducing its memory footprint. - void MarkByteRange(int lo, int hi); + string DumpByteMap(); // Returns the set of kEmpty flags that are in effect at // position p within context. - static uint32 EmptyFlags(const StringPiece& context, const char* p); + static uint32_t EmptyFlags(const StringPiece& context, const char* p); // Returns whether byte c is a word character: ASCII only. // Used by the implementation of \b and \B. @@ -237,7 +221,7 @@ class Prog { // (the DFA has only one-byte lookahead). // - even if the lookahead were possible, the Progs would be huge. // This crude approximation is the same one PCRE uses. - static bool IsWordChar(uint8 c) { + static bool IsWordChar(uint8_t c) { return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || @@ -270,19 +254,37 @@ class Prog { // If matches != NULL and kind == kManyMatch and there is a match, // SearchDFA fills matches with the match IDs of the final matching state. bool SearchDFA(const StringPiece& text, const StringPiece& context, - Anchor anchor, MatchKind kind, - StringPiece* match0, bool* failed, - vector* matches); + Anchor anchor, MatchKind kind, StringPiece* match0, + bool* failed, SparseSet* matches); - // Build the entire DFA for the given match kind. FOR TESTING ONLY. + // The callback issued after building each DFA state with BuildEntireDFA(). + // If next is null, then the memory budget has been exhausted and building + // will halt. Otherwise, the state has been built and next points to an array + // of bytemap_range()+1 slots holding the next states as per the bytemap and + // kByteEndText. The number of the state is implied by the callback sequence: + // the first callback is for state 0, the second callback is for state 1, ... + // match indicates whether the state is a matching state. 
+ using DFAStateCallback = std::function; + + // Build the entire DFA for the given match kind. // Usually the DFA is built out incrementally, as needed, which - // avoids lots of unnecessary work. This function is useful only - // for testing purposes. Returns number of states. - int BuildEntireDFA(MatchKind kind); + // avoids lots of unnecessary work. + // If cb is not empty, it receives one callback per state built. + // Returns the number of states built. + // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY. + int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb); - // Compute byte map. + // Controls whether the DFA should bail out early if the NFA would be faster. + // FOR TESTING ONLY. + static void TEST_dfa_should_bail_when_slow(bool b); + + // Compute bytemap. void ComputeByteMap(); + // Computes whether all matches must begin with the same first + // byte, and if so, returns that byte. If not, returns -1. + int ComputeFirstByte(); + // Run peep-hole optimizer on program. void Optimize(); @@ -329,48 +331,80 @@ class Prog { // Returns true on success, false on error. bool PossibleMatchRange(string* min, string* max, int maxlen); + // EXPERIMENTAL! SUBJECT TO CHANGE! + // Outputs the program fanout into the given sparse array. + void Fanout(SparseArray* fanout); + // Compiles a collection of regexps to Prog. Each regexp will have - // its own Match instruction recording the index in the vector. - static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, - Regexp* re); + // its own Match instruction recording the index in the output vector. + static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); + + // Flattens the Prog from "tree" form to "list" form. This is an in-place + // operation in the sense that the old instructions are lost. + void Flatten(); + + // Walks the Prog; the "successor roots" or predecessors of the reachable + // instructions are marked in rootmap or predmap/predvec, respectively. 
+ // reachable and stk are preallocated scratch structures. + void MarkSuccessors(SparseArray* rootmap, + SparseArray* predmap, + std::vector>* predvec, + SparseSet* reachable, std::vector* stk); + + // Walks the Prog from the given "root" instruction; the "dominator root" + // of the reachable instructions (if such exists) is marked in rootmap. + // reachable and stk are preallocated scratch structures. + void MarkDominator(int root, SparseArray* rootmap, + SparseArray* predmap, + std::vector>* predvec, + SparseSet* reachable, std::vector* stk); + + // Walks the Prog from the given "root" instruction; the reachable + // instructions are emitted in "list" form and appended to flat. + // reachable and stk are preallocated scratch structures. + void EmitList(int root, SparseArray* rootmap, + std::vector* flat, + SparseSet* reachable, std::vector* stk); private: friend class Compiler; DFA* GetDFA(MatchKind kind); + void DeleteDFA(DFA* dfa); bool anchor_start_; // regexp has explicit start anchor bool anchor_end_; // regexp has explicit end anchor bool reversed_; // whether program runs backward over input + bool did_flatten_; // has Flatten been called? bool did_onepass_; // has IsOnePass been called? 
int start_; // entry point for program int start_unanchored_; // unanchored entry point for program int size_; // number of instructions - int byte_inst_count_; // number of kInstByteRange instructions int bytemap_range_; // bytemap_[x] < bytemap_range_ + int first_byte_; // required first byte for match, or -1 if none int flags_; // regexp parse flags - int onepass_statesize_; // byte size of each OneState* node + + int list_count_; // count of lists (see above) + int inst_count_[kNumInst]; // count of instructions by opcode Inst* inst_; // pointer to instruction array + uint8_t* onepass_nodes_; // data for OnePass nodes - Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ - DFA* volatile dfa_first_; // DFA cached for kFirstMatch - DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch - int64 dfa_mem_; // Maximum memory for DFAs. - void (*delete_dfa_)(DFA* dfa); + int64_t dfa_mem_; // Maximum memory for DFAs. + DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch + DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch - Bitmap<256> byterange_; // byterange.Get(x) true if x ends a - // commonly-treated byte range. 
- uint8 bytemap_[256]; // map from input bytes to byte classes - uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x + uint8_t bytemap_[256]; // map from input bytes to byte classes - uint8* onepass_nodes_; // data for OnePass nodes - OneState* onepass_start_; // start node for OnePass program + std::once_flag first_byte_once_; + std::once_flag dfa_first_once_; + std::once_flag dfa_longest_once_; - DISALLOW_EVIL_CONSTRUCTORS(Prog); + Prog(const Prog&) = delete; + Prog& operator=(const Prog&) = delete; }; } // namespace re2 -#endif // RE2_PROG_H__ +#endif // RE2_PROG_H_ diff --git a/contrib/libre2/re2/re2.cc b/contrib/libre2/re2/re2.cc index edef26edd2a..ff4f7a62bd2 100644 --- a/contrib/libre2/re2/re2.cc +++ b/contrib/libre2/re2/re2.cc @@ -9,33 +9,34 @@ #include "re2/re2.h" -#include -#include -#include +#include +#include #include -#include "util/atomicops.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include "util/util.h" -#include "util/flags.h" +#include "util/logging.h" +#include "util/sparse_array.h" +#include "util/strutil.h" +#include "util/utf.h" #include "re2/prog.h" #include "re2/regexp.h" -DEFINE_bool(trace_re2, false, "trace RE2 execution"); - namespace re2 { // Maximum number of args we can set static const int kMaxArgs = 16; static const int kVecSize = 1+kMaxArgs; -const VariadicFunction2 RE2::FullMatch = {}; -const VariadicFunction2 RE2::PartialMatch = {}; -const VariadicFunction2 RE2::Consume = {}; -const VariadicFunction2 RE2::FindAndConsume = {}; - -// This will trigger LNK2005 error in MSVC. -#ifndef COMPILER_MSVC const int RE2::Options::kDefaultMaxMem; // initialized in re2.h -#endif // COMPILER_MSVC RE2::Options::Options(RE2::CannedOptions opt) : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), @@ -53,22 +54,11 @@ RE2::Options::Options(RE2::CannedOptions opt) one_line_(false) { } -// static empty things for use as const references. 
-// To avoid global constructors, initialized on demand. -GLOBAL_MUTEX(empty_mutex); -static const string *empty_string; -static const map *empty_named_groups; -static const map *empty_group_names; - -static void InitEmpty() { - GLOBAL_MUTEX_LOCK(empty_mutex); - if (empty_string == NULL) { - empty_string = new string; - empty_named_groups = new map; - empty_group_names = new map; - } - GLOBAL_MUTEX_UNLOCK(empty_mutex); -} +// static empty objects for use as const references. +// To avoid global constructors, allocated in RE2::Init(). +static const string* empty_string; +static const std::map* empty_named_groups; +static const std::map* empty_group_names; // Converts from Regexp error code to RE2 error code. // Maybe some day they will diverge. In any event, this @@ -109,8 +99,8 @@ static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { static string trunc(const StringPiece& pattern) { if (pattern.size() < 100) - return pattern.as_string(); - return pattern.substr(0, 100).as_string() + "..."; + return pattern.ToString(); + return pattern.substr(0, 100).ToString() + "..."; } @@ -175,19 +165,24 @@ int RE2::Options::ParseFlags() const { } void RE2::Init(const StringPiece& pattern, const Options& options) { - mutex_ = new Mutex; - pattern_ = pattern.as_string(); + static std::once_flag empty_once; + std::call_once(empty_once, []() { + empty_string = new string; + empty_named_groups = new std::map; + empty_group_names = new std::map; + }); + + pattern_ = pattern.ToString(); options_.Copy(options); - InitEmpty(); - error_ = empty_string; - error_code_ = NoError; - suffix_regexp_ = NULL; entire_regexp_ = NULL; + suffix_regexp_ = NULL; prog_ = NULL; rprog_ = NULL; + error_ = empty_string; + error_code_ = NoError; + num_captures_ = -1; named_groups_ = NULL; group_names_ = NULL; - num_captures_ = -1; RegexpStatus status; entire_regexp_ = Regexp::Parse( @@ -195,19 +190,16 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { 
static_cast(options_.ParseFlags()), &status); if (entire_regexp_ == NULL) { - if (error_ == empty_string) - error_ = new string(status.Text()); if (options_.log_errors()) { LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " << status.Text(); } - error_arg_ = status.error_arg().as_string(); + error_ = new string(status.Text()); error_code_ = RegexpErrorToRE2(status.code()); + error_arg_ = status.error_arg().ToString(); return; } - prefix_.clear(); - prefix_foldcase_ = false; re2::Regexp* suffix; if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) suffix_regexp_ = suffix; @@ -236,17 +228,16 @@ void RE2::Init(const StringPiece& pattern, const Options& options) { // Returns rprog_, computing it if needed. re2::Prog* RE2::ReverseProg() const { - MutexLock l(mutex_); - if (rprog_ == NULL && error_ == empty_string) { - rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3); - if (rprog_ == NULL) { - if (options_.log_errors()) - LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'"; - error_ = new string("pattern too large - reverse compile failed"); - error_code_ = RE2::ErrorPatternTooLarge; - return NULL; + std::call_once(rprog_once_, [](const RE2* re) { + re->rprog_ = + re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); + if (re->rprog_ == NULL) { + if (re->options_.log_errors()) + LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; + re->error_ = new string("pattern too large - reverse compile failed"); + re->error_code_ = RE2::ErrorPatternTooLarge; } - } + }, this); return rprog_; } @@ -255,7 +246,6 @@ RE2::~RE2() { suffix_regexp_->Decref(); if (entire_regexp_) entire_regexp_->Decref(); - delete mutex_; delete prog_; delete rprog_; if (error_ != empty_string) @@ -272,29 +262,52 @@ int RE2::ProgramSize() const { return prog_->size(); } -// Returns named_groups_, computing it if needed. 
-const map& RE2::NamedCapturingGroups() const { - MutexLock l(mutex_); - if (!ok()) - return *empty_named_groups; - if (named_groups_ == NULL) { - named_groups_ = suffix_regexp_->NamedCaptures(); - if (named_groups_ == NULL) - named_groups_ = empty_named_groups; +int RE2::ProgramFanout(std::map* histogram) const { + if (prog_ == NULL) + return -1; + SparseArray fanout(prog_->size()); + prog_->Fanout(&fanout); + histogram->clear(); + for (SparseArray::iterator i = fanout.begin(); i != fanout.end(); ++i) { + // TODO(junyer): Optimise this? + int bucket = 0; + while (1 << bucket < i->second) { + bucket++; + } + (*histogram)[bucket]++; } + return histogram->rbegin()->first; +} + +// Returns num_captures_, computing it if needed, or -1 if the +// regexp wasn't valid on construction. +int RE2::NumberOfCapturingGroups() const { + std::call_once(num_captures_once_, [](const RE2* re) { + if (re->suffix_regexp_ != NULL) + re->num_captures_ = re->suffix_regexp_->NumCaptures(); + }, this); + return num_captures_; +} + +// Returns named_groups_, computing it if needed. +const std::map& RE2::NamedCapturingGroups() const { + std::call_once(named_groups_once_, [](const RE2* re) { + if (re->suffix_regexp_ != NULL) + re->named_groups_ = re->suffix_regexp_->NamedCaptures(); + if (re->named_groups_ == NULL) + re->named_groups_ = empty_named_groups; + }, this); return *named_groups_; } // Returns group_names_, computing it if needed. 
-const map& RE2::CapturingGroupNames() const { - MutexLock l(mutex_); - if (!ok()) - return *empty_group_names; - if (group_names_ == NULL) { - group_names_ = suffix_regexp_->CaptureNames(); - if (group_names_ == NULL) - group_names_ = empty_group_names; - } +const std::map& RE2::CapturingGroupNames() const { + std::call_once(group_names_once_, [](const RE2* re) { + if (re->suffix_regexp_ != NULL) + re->group_names_ = re->suffix_regexp_->CaptureNames(); + if (re->group_names_ == NULL) + re->group_names_ = empty_group_names; + }, this); return *group_names_; } @@ -312,7 +325,7 @@ bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, bool RE2::ConsumeN(StringPiece* input, const RE2& re, const Arg* const args[], int n) { - int consumed; + size_t consumed; if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { input->remove_prefix(consumed); return true; @@ -323,7 +336,7 @@ bool RE2::ConsumeN(StringPiece* input, const RE2& re, bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, const Arg* const args[], int n) { - int consumed; + size_t consumed; if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { input->remove_prefix(consumed); return true; @@ -332,31 +345,12 @@ bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, } } -// Returns the maximum submatch needed for the rewrite to be done by Replace(). -// E.g. if rewrite == "foo \\2,\\1", returns 2. -int RE2::MaxSubmatch(const StringPiece& rewrite) { - int max = 0; - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - if (*s == '\\') { - s++; - int c = (s < end) ? 
*s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n > max) - max = n; - } - } - } - return max; -} - bool RE2::Replace(string *str, const RE2& re, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); - if (static_cast(nvec) > arraysize(vec)) + if (nvec > arraysize(vec)) return false; if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) return false; @@ -376,7 +370,7 @@ int RE2::GlobalReplace(string *str, const StringPiece& rewrite) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); - if (static_cast(nvec) > arraysize(vec)) + if (nvec > arraysize(vec)) return false; const char* p = str->data(); @@ -384,13 +378,44 @@ int RE2::GlobalReplace(string *str, const char* lastend = NULL; string out; int count = 0; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Iterate just once when fuzzing. Otherwise, we easily get bogged down + // and coverage is unlikely to improve despite significant expense. + while (p == str->data()) { +#else while (p <= ep) { - if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec)) +#endif + if (!re.Match(*str, static_cast(p - str->data()), + str->size(), UNANCHORED, vec, nvec)) break; if (p < vec[0].begin()) out.append(p, vec[0].begin() - p); if (vec[0].begin() == lastend && vec[0].size() == 0) { // Disallow empty match at end of last match: skip ahead. + // + // fullrune() takes int, not size_t. However, it just looks + // at the leading byte and treats any length >= 4 the same. + if (re.options().encoding() == RE2::Options::EncodingUTF8 && + fullrune(p, static_cast(std::min(static_cast(4), + ep - p)))) { + // re is in UTF-8 mode and there is enough left of str + // to allow us to advance by up to UTFmax bytes. + Rune r; + int n = chartorune(&r, p); + // Some copies of chartorune have a bug that accepts + // encodings of values in (10FFFF, 1FFFFF] as valid. 
+ if (r > Runemax) { + n = 1; + r = Runeerror; + } + if (!(n == 1 && r == Runeerror)) { // no decoding error + out.append(p, n); + p += n; + continue; + } + } + // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, + // we fell through from above and the GIGO principle applies. if (p < ep) out.append(p, 1); p++; @@ -407,6 +432,7 @@ int RE2::GlobalReplace(string *str, if (p < ep) out.append(p, ep - p); + using std::swap; swap(out, *str); return count; } @@ -417,7 +443,7 @@ bool RE2::Extract(const StringPiece &text, string *out) { StringPiece vec[kVecSize]; int nvec = 1 + MaxSubmatch(rewrite); - if (static_cast(nvec) > arraysize(vec)) + if (nvec > arraysize(vec)) return false; if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) @@ -438,7 +464,7 @@ string RE2::QuoteMeta(const StringPiece& unquoted) { // that. (This also makes it identical to the perl function of the // same name except for the null-character special case; // see `perldoc -f quotemeta`.) - for (int ii = 0; ii < unquoted.length(); ++ii) { + for (size_t ii = 0; ii < unquoted.size(); ++ii) { // Note that using 'isalnum' here raises the benchmark time from // 32ns to 58ns: if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && @@ -469,19 +495,19 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { if (prog_ == NULL) return false; - int n = prefix_.size(); + int n = static_cast(prefix_.size()); if (n > maxlen) n = maxlen; // Determine initial min max from prefix_ literal. - string pmin, pmax; - pmin = prefix_.substr(0, n); - pmax = prefix_.substr(0, n); + *min = prefix_.substr(0, n); + *max = prefix_.substr(0, n); if (prefix_foldcase_) { - // prefix is ASCII lowercase; change pmin to uppercase. + // prefix is ASCII lowercase; change *min to uppercase. 
for (int i = 0; i < n; i++) { - if ('a' <= pmin[i] && pmin[i] <= 'z') - pmin[i] += 'A' - 'a'; + char& c = (*min)[i]; + if ('a' <= c && c <= 'z') + c += 'A' - 'a'; } } @@ -489,13 +515,13 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { string dmin, dmax; maxlen -= n; if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { - pmin += dmin; - pmax += dmax; - } else if (pmax.size() > 0) { + min->append(dmin); + max->append(dmax); + } else if (!max->empty()) { // prog_->PossibleMatchRange has failed us, // but we still have useful information from prefix_. - // Round up pmax to allow any possible suffix. - pmax = PrefixSuccessor(pmax); + // Round up *max to allow any possible suffix. + PrefixSuccessor(max); } else { // Nothing useful. *min = ""; @@ -503,19 +529,17 @@ bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const { return false; } - *min = pmin; - *max = pmax; return true; } // Avoid possible locale nonsense in standard strcasecmp. // The string a is known to be all lowercase. 
-static int ascii_strcasecmp(const char* a, const char* b, int len) { +static int ascii_strcasecmp(const char* a, const char* b, size_t len) { const char *ae = a + len; for (; a < ae; a++, b++) { - uint8 x = *a; - uint8 y = *b; + uint8_t x = *a; + uint8_t y = *b; if ('A' <= y && y <= 'Z') y += 'a' - 'A'; if (x != y) @@ -528,8 +552,8 @@ static int ascii_strcasecmp(const char* a, const char* b, int len) { /***** Actual matching and rewriting code *****/ bool RE2::Match(const StringPiece& text, - int startpos, - int endpos, + size_t startpos, + size_t endpos, Anchor re_anchor, StringPiece* submatch, int nsubmatch) const { @@ -539,9 +563,12 @@ bool RE2::Match(const StringPiece& text, return false; } - if (startpos < 0 || startpos > endpos || endpos > text.size()) { + if (startpos > endpos || endpos > text.size()) { if (options_.log_errors()) - LOG(ERROR) << "RE2: invalid startpos, endpos pair."; + LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" + << "startpos: " << startpos << ", " + << "endpos: " << endpos << ", " + << "text size: " << text.size() << "]"; return false; } @@ -574,7 +601,7 @@ bool RE2::Match(const StringPiece& text, re_anchor = ANCHOR_START; // Check for the required prefix, if any. - int prefixlen = 0; + size_t prefixlen = 0; if (!prefix_.empty()) { if (startpos != 0) return false; @@ -610,7 +637,7 @@ bool RE2::Match(const StringPiece& text, const int MaxBitStateProg = 500; // prog_->size() <= Max. 
const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits) bool can_bit_state = prog_->size() <= MaxBitStateProg; - int bit_state_text_max = MaxBitStateVector / prog_->size(); + size_t bit_state_text_max = MaxBitStateVector / prog_->size(); bool dfa_failed = false; switch (re_anchor) { @@ -619,24 +646,16 @@ bool RE2::Match(const StringPiece& text, if (!prog_->SearchDFA(subtext, text, anchor, kind, matchp, &dfa_failed, NULL)) { if (dfa_failed) { + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " + << "bytemap range " << prog_->bytemap_range() << ", " + << "list count " << prog_->list_count(); // Fall back to NFA below. skipped_test = true; - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " DFA failed."; break; } - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " used DFA - no match."; return false; } - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " used DFA - match"; if (matchp == NULL) // Matched. Don't care where return true; // SearchDFA set match[0].end() but didn't know where the @@ -648,26 +667,18 @@ bool RE2::Match(const StringPiece& text, if (!prog->SearchDFA(match, text, Prog::kAnchored, Prog::kLongestMatch, &match, &dfa_failed, NULL)) { if (dfa_failed) { + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " + << "bytemap range " << prog_->bytemap_range() << ", " + << "list count " << prog_->list_count(); // Fall back to NFA below. 
skipped_test = true; - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " reverse DFA failed."; break; } - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " DFA inconsistency."; if (options_.log_errors()) - LOG(ERROR) << "DFA inconsistency"; + LOG(ERROR) << "SearchDFA inconsistency"; return false; } - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " used reverse DFA."; break; } @@ -686,35 +697,24 @@ bool RE2::Match(const StringPiece& text, // the DFA does. if (can_one_pass && text.size() <= 4096 && (ncap > 1 || text.size() <= 8)) { - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " skipping DFA for OnePass."; skipped_test = true; break; } if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) { - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " skipping DFA for BitState."; skipped_test = true; break; } if (!prog_->SearchDFA(subtext, text, anchor, kind, &match, &dfa_failed, NULL)) { if (dfa_failed) { - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " DFA failed."; + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " + << "bytemap range " << prog_->bytemap_range() << ", " + << "list count " << prog_->list_count(); + // Fall back to NFA below. 
skipped_test = true; break; } - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " used DFA - no match."; return false; } break; @@ -740,20 +740,12 @@ bool RE2::Match(const StringPiece& text, } if (can_one_pass && anchor != Prog::kUnanchored) { - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " using OnePass."; if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) LOG(ERROR) << "SearchOnePass inconsistency"; return false; } } else if (can_bit_state && subtext1.size() <= bit_state_text_max) { - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " using BitState."; if (!prog_->SearchBitState(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) @@ -761,10 +753,6 @@ bool RE2::Match(const StringPiece& text, return false; } } else { - if (FLAGS_trace_re2) - LOG(INFO) << "Match " << trunc(pattern_) - << " [" << CEscape(subtext) << "]" - << " using NFA."; if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { if (!skipped_test && options_.log_errors()) LOG(ERROR) << "SearchNFA inconsistency"; @@ -775,19 +763,19 @@ bool RE2::Match(const StringPiece& text, // Adjust overall match for required prefix that we stripped off. if (prefixlen > 0 && nsubmatch > 0) - submatch[0] = StringPiece(submatch[0].begin() - prefixlen, + submatch[0] = StringPiece(submatch[0].data() - prefixlen, submatch[0].size() + prefixlen); // Zero submatches that don't exist in the regexp. for (int i = ncap; i < nsubmatch; i++) - submatch[i] = NULL; + submatch[i] = StringPiece(); return true; } // Internal matcher - like Match() but takes Args not StringPieces. 
bool RE2::DoMatch(const StringPiece& text, Anchor anchor, - int* consumed, + size_t* consumed, const Arg* const* args, int n) const { if (!ok()) { @@ -807,7 +795,7 @@ bool RE2::DoMatch(const StringPiece& text, StringPiece stkvec[kVecSize]; StringPiece* heapvec = NULL; - if (static_cast(nvec) <= arraysize(stkvec)) { + if (nvec <= arraysize(stkvec)) { vec = stkvec; } else { vec = new StringPiece[nvec]; @@ -819,8 +807,8 @@ bool RE2::DoMatch(const StringPiece& text, return false; } - if(consumed != NULL) - *consumed = vec[0].end() - text.begin(); + if (consumed != NULL) + *consumed = static_cast(vec[0].end() - text.begin()); if (n == 0 || args == NULL) { // We are not interested in results @@ -831,7 +819,6 @@ bool RE2::DoMatch(const StringPiece& text, int ncap = NumberOfCapturingGroups(); if (ncap < n) { // RE has fewer capturing groups than number of arg pointers passed in - VLOG(1) << "Asked for " << n << " but only have " << ncap; delete[] heapvec; return false; } @@ -841,8 +828,6 @@ bool RE2::DoMatch(const StringPiece& text, const StringPiece& s = vec[i+1]; if (!args[i]->Parse(s.data(), s.size())) { // TODO: Should we indicate what the error was? - VLOG(1) << "Parse error on #" << i << " " << s << " " - << (void*)s.data() << "/" << s.size(); delete[] heapvec; return false; } @@ -852,56 +837,6 @@ bool RE2::DoMatch(const StringPiece& text, return true; } -// Append the "rewrite" string, with backslash subsitutions from "vec", -// to string "out". -bool RE2::Rewrite(string *out, const StringPiece &rewrite, - const StringPiece *vec, int veclen) const { - for (const char *s = rewrite.data(), *end = s + rewrite.size(); - s < end; s++) { - int c = *s; - if (c == '\\') { - s++; - c = (s < end) ? 
*s : -1; - if (isdigit(c)) { - int n = (c - '0'); - if (n >= veclen) { - if (options_.log_errors()) { - LOG(ERROR) << "requested group " << n - << " in regexp " << rewrite.data(); - } - return false; - } - StringPiece snip = vec[n]; - if (snip.size() > 0) - out->append(snip.data(), snip.size()); - } else if (c == '\\') { - out->push_back('\\'); - } else { - if (options_.log_errors()) - LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); - return false; - } - } else { - out->push_back(c); - } - } - return true; -} - -// Return the number of capturing subpatterns, or -1 if the -// regexp wasn't valid on construction. -int RE2::NumberOfCapturingGroups() const { - if (suffix_regexp_ == NULL) - return -1; - int n; - ATOMIC_LOAD_RELAXED(n, &num_captures_); - if (n == -1) { - n = suffix_regexp_->NumCaptures(); - ATOMIC_STORE_RELAXED(&num_captures_, n); - } - return n; -} - // Checks that the rewrite string is well-formed with respect to this // regular expression. bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { @@ -940,33 +875,94 @@ bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const { return true; } +// Returns the maximum submatch needed for the rewrite to be done by Replace(). +// E.g. if rewrite == "foo \\2,\\1", returns 2. +int RE2::MaxSubmatch(const StringPiece& rewrite) { + int max = 0; + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s == '\\') { + s++; + int c = (s < end) ? *s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n > max) + max = n; + } + } + } + return max; +} + +// Append the "rewrite" string, with backslash subsitutions from "vec", +// to string "out". +bool RE2::Rewrite(string* out, const StringPiece& rewrite, + const StringPiece* vec, int veclen) const { + for (const char *s = rewrite.data(), *end = s + rewrite.size(); + s < end; s++) { + if (*s != '\\') { + out->push_back(*s); + continue; + } + s++; + int c = (s < end) ? 
*s : -1; + if (isdigit(c)) { + int n = (c - '0'); + if (n >= veclen) { + if (options_.log_errors()) { + LOG(ERROR) << "requested group " << n + << " in regexp " << rewrite.data(); + } + return false; + } + StringPiece snip = vec[n]; + if (snip.size() > 0) + out->append(snip.data(), snip.size()); + } else if (c == '\\') { + out->push_back('\\'); + } else { + if (options_.log_errors()) + LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); + return false; + } + } + return true; +} + /***** Parsers for various types *****/ -bool RE2::Arg::parse_null(const char* str, int n, void* dest) { +bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) { // We fail if somebody asked us to store into a non-NULL void* pointer return (dest == NULL); } -bool RE2::Arg::parse_string(const char* str, int n, void* dest) { +bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) { if (dest == NULL) return true; reinterpret_cast(dest)->assign(str, n); return true; } -bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) { +bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { if (dest == NULL) return true; - reinterpret_cast(dest)->set(str, n); + *(reinterpret_cast(dest)) = StringPiece(str, n); return true; } -bool RE2::Arg::parse_char(const char* str, int n, void* dest) { +bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) { if (n != 1) return false; if (dest == NULL) return true; *(reinterpret_cast(dest)) = str[0]; return true; } -bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) { +bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) { if (n != 1) return false; if (dest == NULL) return true; *(reinterpret_cast(dest)) = str[0]; @@ -976,16 +972,23 @@ bool RE2::Arg::parse_uchar(const char* str, int n, 
void* dest) { // Largest number spec that we are willing to parse static const int kMaxNumberLength = 32; -// REQUIRES "buf" must have length at least kMaxNumberLength+1 +// REQUIRES "buf" must have length at least nbuf. // Copies "str" into "buf" and null-terminates. // Overwrites *np with the new length. -static const char* TerminateNumber(char* buf, const char* str, int* np) { - int n = *np; - if (n <= 0) return ""; +static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, + size_t* np, bool accept_spaces) { + size_t n = *np; + if (n == 0) return ""; if (n > 0 && isspace(*str)) { // We are less forgiving than the strtoxxx() routines and do not - // allow leading spaces. - return ""; + // allow leading spaces. We do allow leading spaces for floats. + if (!accept_spaces) { + return ""; + } + while (n > 0 && isspace(*str)) { + n--; + str++; + } } // Although buf has a fixed maximum size, we can still handle @@ -1015,7 +1018,7 @@ static const char* TerminateNumber(char* buf, const char* str, int* np) { str--; } - if (n > kMaxNumberLength) return ""; + if (n > nbuf-1) return ""; memmove(buf, str, n); if (neg) { @@ -1027,12 +1030,12 @@ static const char* TerminateNumber(char* buf, const char* str, int* np) { } bool RE2::Arg::parse_long_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); char* end; errno = 0; long r = strtol(str, &end, radix); @@ -1044,16 +1047,16 @@ bool RE2::Arg::parse_long_radix(const char* str, } bool RE2::Arg::parse_ulong_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); if (str[0] == '-') { - // strtoul() will silently accept 
negative numbers and parse - // them. This module is more strict and treats them as errors. - return false; + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; } char* end; @@ -1067,78 +1070,77 @@ bool RE2::Arg::parse_ulong_radix(const char* str, } bool RE2::Arg::parse_short_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (short)r; return true; } bool RE2::Arg::parse_ushort_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((ushort)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (unsigned short)r; return true; } bool RE2::Arg::parse_int_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (int)r; return true; } bool RE2::Arg::parse_uint_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, 
+ void* dest, + int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((uint)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (unsigned int)r; return true; } -#ifdef RE2_HAVE_LONGLONG bool RE2::Arg::parse_longlong_radix(const char* str, - int n, - void* dest, - int radix) { - if (n == 0) return false; - char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); - char* end; - errno = 0; - int64 r = strtoll(str, &end, radix); - if (end != str + n) return false; // Leftover junk - if (errno) return false; - if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; - return true; -} - -bool RE2::Arg::parse_ulonglong_radix(const char* str, - int n, + size_t n, void* dest, int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; - str = TerminateNumber(buf, str, &n); + str = TerminateNumber(buf, sizeof buf, str, &n, false); + char* end; + errno = 0; + long long r = strtoll(str, &end, radix); + if (end != str + n) return false; // Leftover junk + if (errno) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = r; + return true; +} + +bool RE2::Arg::parse_ulonglong_radix(const char* str, + size_t n, + void* dest, + int radix) { + if (n == 0) return false; + char buf[kMaxNumberLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, false); if (str[0] == '-') { // strtoull() will silently accept negative numbers and parse // them. This module is more strict and treats them as errors. 
@@ -1146,73 +1148,71 @@ bool RE2::Arg::parse_ulonglong_radix(const char* str, } char* end; errno = 0; - uint64 r = strtoull(str, &end, radix); + unsigned long long r = strtoull(str, &end, radix); if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = r; return true; } -#endif -static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) { +static bool parse_double_float(const char* str, size_t n, bool isfloat, + void* dest) { if (n == 0) return false; static const int kMaxLength = 200; - char buf[kMaxLength]; - if (n >= kMaxLength) return false; - memcpy(buf, str, n); - buf[n] = '\0'; - errno = 0; + char buf[kMaxLength+1]; + str = TerminateNumber(buf, sizeof buf, str, &n, true); char* end; + errno = 0; double r; if (isfloat) { - r = strtof(buf, &end); + r = strtof(str, &end); } else { - r = strtod(buf, &end); + r = strtod(str, &end); } - if (end != buf + n) return false; // Leftover junk + if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; if (isfloat) { - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (float)r; } else { *(reinterpret_cast(dest)) = r; } return true; } -bool RE2::Arg::parse_double(const char* str, int n, void* dest) { +bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) { return parse_double_float(str, n, false, dest); } -bool RE2::Arg::parse_float(const char* str, int n, void* dest) { +bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) { return parse_double_float(str, n, true, dest); } - -#define DEFINE_INTEGER_PARSERS(name) \ - bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ - bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ - bool 
RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ - bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 0); \ +#define DEFINE_INTEGER_PARSER(name) \ + bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ } -DEFINE_INTEGER_PARSERS(short); -DEFINE_INTEGER_PARSERS(ushort); -DEFINE_INTEGER_PARSERS(int); -DEFINE_INTEGER_PARSERS(uint); -DEFINE_INTEGER_PARSERS(long); -DEFINE_INTEGER_PARSERS(ulong); -DEFINE_INTEGER_PARSERS(longlong); -DEFINE_INTEGER_PARSERS(ulonglong); +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); -#undef DEFINE_INTEGER_PARSERS +#undef DEFINE_INTEGER_PARSER } // namespace re2 diff --git a/contrib/libre2/re2/re2.h b/contrib/libre2/re2/re2.h index 1aabcbc4fed..9307704f71f 100644 --- a/contrib/libre2/re2/re2.h +++ b/contrib/libre2/re2/re2.h @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_RE2_H -#define RE2_RE2_H +#ifndef RE2_RE2_H_ +#define RE2_RE2_H_ // C++ interface to the re2 regular-expression library. // RE2 supports Perl-style regular expressions (with extensions like @@ -17,7 +17,7 @@ // some of the more complicated things thrown away. 
In particular, // backreferences and generalized assertions are not available, nor is \Z. // -// See http://code.google.com/p/re2/wiki/Syntax for the syntax +// See https://github.com/google/re2/wiki/Syntax for the syntax // supported by RE2, and a comparison with PCRE and PERL regexps. // // For those not familiar with Perl's regular expressions, @@ -179,38 +179,24 @@ // RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d)); // will leave 64 in a, b, c, and d. +#include #include +#include #include +#include #include -#include "re2/stringpiece.h" -#include "re2/variadic_function.h" -#ifndef RE2_HAVE_LONGLONG -#define RE2_HAVE_LONGLONG 1 -#endif +#include "re2/stringpiece.h" + +namespace re2 { +class Prog; +class Regexp; +} // namespace re2 namespace re2 { +// TODO(junyer): Get rid of this. using std::string; -using std::map; -class Mutex; -class Prog; -class Regexp; - -// The following enum should be used only as a constructor argument to indicate -// that the variable has static storage class, and that the constructor should -// do nothing to its state. It indicates to the reader that it is legal to -// declare a static instance of the class, provided the constructor is given -// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a -// static variable that has a constructor or a destructor because invocation -// order is undefined. However, IF the type can be initialized by filling with -// zeroes (which the loader does for static variables), AND the type's -// destructor does nothing to the storage, then a constructor for static -// initialization can be declared as -// explicit MyClass(LinkerInitialized x) {} -// and invoked as -// static MyClass my_variable_name(LINKER_INITIALIZED); -enum LinkerInitialized { LINKER_INITIALIZED }; // Interface for regular expression matching. Also corresponds to a // pre-compiled regular expression. 
An "RE2" object is safe for @@ -266,7 +252,7 @@ class RE2 { RE2(const string& pattern); #endif RE2(const StringPiece& pattern); - RE2(const StringPiece& pattern, const Options& option); + RE2(const StringPiece& pattern, const Options& options); ~RE2(); // Returns whether RE2 was created properly. @@ -293,6 +279,11 @@ class RE2 { // Larger numbers are more expensive than smaller numbers. int ProgramSize() const; + // EXPERIMENTAL! SUBJECT TO CHANGE! + // Outputs the program fanout as a histogram bucketed by powers of 2. + // Returns the number of the largest non-empty bucket. + int ProgramFanout(std::map* histogram) const; + // Returns the underlying Regexp; not for general use. // Returns entire_regexp_ so that callers don't need // to know about prefix_ and prefix_foldcase_. @@ -300,21 +291,21 @@ class RE2 { /***** The useful part: the matching interface *****/ - // Matches "text" against "pattern". If pointer arguments are + // Matches "text" against "re". If pointer arguments are // supplied, copies matched sub-patterns into them. // // You can pass in a "const char*" or a "string" for "text". - // You can pass in a "const char*" or a "string" or a "RE2" for "pattern". + // You can pass in a "const char*" or a "string" or a "RE2" for "re". // // The provided pointer arguments can be pointers to any scalar numeric // type, or one of: // string (matched piece is copied to string) // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, int)" exists) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: - // a. "text" matches "pattern" exactly + // a. "text" matches "re" exactly // b. The number of matched sub-patterns is >= number of supplied pointers // c. The "i"th argument has a suitable type for holding the // string captured as the "i"th sub-pattern. 
If you pass in @@ -330,32 +321,65 @@ class RE2 { // RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number); static bool FullMatchN(const StringPiece& text, const RE2& re, const Arg* const args[], int argc); - static const VariadicFunction2< - bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch; - // Exactly like FullMatch(), except that "pattern" is allowed to match + // Exactly like FullMatch(), except that "re" is allowed to match // a substring of "text". - static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args + static bool PartialMatchN(const StringPiece& text, const RE2& re, const Arg* const args[], int argc); - static const VariadicFunction2< - bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch; - // Like FullMatch() and PartialMatch(), except that pattern has to - // match a prefix of "text", and "input" is advanced past the matched + // Like FullMatch() and PartialMatch(), except that "re" has to match + // a prefix of the text, and "input" is advanced past the matched // text. Note: "input" is modified iff this routine returns true. - static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args + static bool ConsumeN(StringPiece* input, const RE2& re, const Arg* const args[], int argc); - static const VariadicFunction2< - bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume; - // Like Consume(..), but does not anchor the match at the beginning of the - // string. That is, "pattern" need not start its match at the beginning of - // "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next - // word in "s" and stores it in "word". - static bool FindAndConsumeN(StringPiece* input, const RE2& pattern, - const Arg* const args[], int argc); - static const VariadicFunction2< - bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume; + // Like Consume(), but does not anchor the match at the beginning of + // the text. 
That is, "re" need not start its match at the beginning + // of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds + // the next word in "s" and stores it in "word". + static bool FindAndConsumeN(StringPiece* input, const RE2& re, + const Arg* const args[], int argc); + +#ifndef SWIG + private: + template + static inline bool Apply(F f, SP sp, const RE2& re) { + return f(sp, re, NULL, 0); + } + + template + static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) { + const Arg* const args[] = {&a...}; + const int argc = sizeof...(a); + return f(sp, re, args, argc); + } + + public: + // In order to allow FullMatch() et al. to be called with a varying number + // of arguments of varying types, we use two layers of variadic templates. + // The first layer constructs the temporary Arg objects. The second layer + // (above) constructs the array of pointers to the temporary Arg objects. + + template + static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) { + return Apply(FullMatchN, text, re, Arg(std::forward(a))...); + } + + template + static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) { + return Apply(PartialMatchN, text, re, Arg(std::forward(a))...); + } + + template + static bool Consume(StringPiece* input, const RE2& re, A&&... a) { + return Apply(ConsumeN, input, re, Arg(std::forward(a))...); + } + + template + static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) { + return Apply(FindAndConsumeN, input, re, Arg(std::forward(a))...); + } +#endif // Replace the first match of "pattern" in "str" with "rewrite". // Within "rewrite", backslash-escaped digits (\1 to \9) can be @@ -397,6 +421,8 @@ class RE2 { // // Returns true iff a match occurred and the extraction happened // successfully; if no match occurs, the string is left unaffected. + // + // REQUIRES: "text" must not alias any part of "*out". 
static bool Extract(const StringPiece &text, const RE2& pattern, const StringPiece &rewrite, @@ -440,17 +466,16 @@ class RE2 { // does not count: if the regexp is "(a)(b)", returns 2. int NumberOfCapturingGroups() const; - // Return a map from names to capturing indices. // The map records the index of the leftmost group // with the given name. // Only valid until the re is deleted. - const map& NamedCapturingGroups() const; + const std::map& NamedCapturingGroups() const; // Return a map from capturing indices to names. // The map has no entries for unnamed groups. // Only valid until the re is deleted. - const map& CapturingGroupNames() const; + const std::map& CapturingGroupNames() const; // General matching routine. // Match against text starting at offset startpos @@ -459,8 +484,8 @@ class RE2 { // On a successful match, fills in match[] (up to nmatch entries) // with information about submatches. // I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true, - // setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar", - // match[3] = NULL, ..., up to match[nmatch-1] = NULL. + // setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar", + // match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL. // // Don't ask for more match information than you will use: // runs much faster with nmatch == 1 than nmatch > 1, and @@ -471,10 +496,10 @@ class RE2 { // Passing text == StringPiece(NULL, 0) will be handled like any other // empty string, but note that on return, it will not be possible to tell // whether submatch i matched the empty string or did not match: - // either way, match[i] == NULL. + // either way, match[i].data() == NULL. bool Match(const StringPiece& text, - int startpos, - int endpos, + size_t startpos, + size_t endpos, Anchor anchor, StringPiece *match, int nmatch) const; @@ -498,8 +523,8 @@ class RE2 { // Returns true on success. This method can fail because of a malformed // rewrite string. 
CheckRewriteString guarantees that the rewrite will // be sucessful. - bool Rewrite(string *out, - const StringPiece &rewrite, + bool Rewrite(string* out, + const StringPiece& rewrite, const StringPiece* vec, int veclen) const; @@ -632,19 +657,7 @@ class RE2 { void set_one_line(bool b) { one_line_ = b; } void Copy(const Options& src) { - encoding_ = src.encoding_; - posix_syntax_ = src.posix_syntax_; - longest_match_ = src.longest_match_; - log_errors_ = src.log_errors_; - max_mem_ = src.max_mem_; - literal_ = src.literal_; - never_nl_ = src.never_nl_; - dot_nl_ = src.dot_nl_; - never_capture_ = src.never_capture_; - case_sensitive_ = src.case_sensitive_; - perl_classes_ = src.perl_classes_; - word_boundary_ = src.word_boundary_; - one_line_ = src.one_line_; + *this = src; } int ParseFlags() const; @@ -663,10 +676,6 @@ class RE2 { bool perl_classes_; bool word_boundary_; bool one_line_; - - //DISALLOW_EVIL_CONSTRUCTORS(Options); - Options(const Options&); - void operator=(const Options&); }; // Returns the options set in the constructor. 
@@ -679,10 +688,8 @@ class RE2 { static inline Arg CRadix(unsigned int* x); static inline Arg CRadix(long* x); static inline Arg CRadix(unsigned long* x); - #ifdef RE2_HAVE_LONGLONG static inline Arg CRadix(long long* x); static inline Arg CRadix(unsigned long long* x); - #endif static inline Arg Hex(short* x); static inline Arg Hex(unsigned short* x); @@ -690,10 +697,8 @@ class RE2 { static inline Arg Hex(unsigned int* x); static inline Arg Hex(long* x); static inline Arg Hex(unsigned long* x); - #ifdef RE2_HAVE_LONGLONG static inline Arg Hex(long long* x); static inline Arg Hex(unsigned long long* x); - #endif static inline Arg Octal(short* x); static inline Arg Octal(unsigned short* x); @@ -701,47 +706,50 @@ class RE2 { static inline Arg Octal(unsigned int* x); static inline Arg Octal(long* x); static inline Arg Octal(unsigned long* x); - #ifdef RE2_HAVE_LONGLONG static inline Arg Octal(long long* x); static inline Arg Octal(unsigned long long* x); - #endif private: void Init(const StringPiece& pattern, const Options& options); bool DoMatch(const StringPiece& text, - Anchor anchor, - int* consumed, - const Arg* const args[], - int n) const; + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n) const; re2::Prog* ReverseProg() const; - mutable Mutex* mutex_; - string pattern_; // string regular expression - Options options_; // option flags + string pattern_; // string regular expression + Options options_; // option flags string prefix_; // required prefix (before regexp_) bool prefix_foldcase_; // prefix is ASCII case-insensitive re2::Regexp* entire_regexp_; // parsed regular expression re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed re2::Prog* prog_; // compiled program for regexp - mutable re2::Prog* rprog_; // reverse program for regexp - bool is_one_pass_; // can use prog_->SearchOnePass? 
- mutable const string* error_; // Error indicator - // (or points to empty string) - mutable ErrorCode error_code_; // Error code - mutable string error_arg_; // Fragment of regexp showing error - mutable int num_captures_; // Number of capturing groups + bool is_one_pass_; // can use prog_->SearchOnePass? + + mutable re2::Prog* rprog_; // reverse program for regexp + mutable const string* error_; // Error indicator + // (or points to empty string) + mutable ErrorCode error_code_; // Error code + mutable string error_arg_; // Fragment of regexp showing error + mutable int num_captures_; // Number of capturing groups // Map from capture names to indices - mutable const map* named_groups_; + mutable const std::map* named_groups_; // Map from capture indices to names - mutable const map* group_names_; + mutable const std::map* group_names_; - //DISALLOW_EVIL_CONSTRUCTORS(RE2); - RE2(const RE2&); - void operator=(const RE2&); + // Onces for lazy computations. + mutable std::once_flag rprog_once_; + mutable std::once_flag num_captures_once_; + mutable std::once_flag named_groups_once_; + mutable std::once_flag group_names_once_; + + RE2(const RE2&) = delete; + RE2& operator=(const RE2&) = delete; }; /***** Implementation details *****/ @@ -752,7 +760,7 @@ class RE2 { template class _RE2_MatchObject { public: - static inline bool Parse(const char* str, int n, void* dest) { + static inline bool Parse(const char* str, size_t n, void* dest) { if (dest == NULL) return true; T* object = reinterpret_cast(dest); return object->ParseFrom(str, n); @@ -767,65 +775,64 @@ class RE2::Arg { // Constructor specially designed for NULL arguments Arg(void*); - typedef bool (*Parser)(const char* str, int n, void* dest); + typedef bool (*Parser)(const char* str, size_t n, void* dest); // Type-specific parsers -#define MAKE_PARSER(type,name) \ - Arg(type* p) : arg_(p), parser_(name) { } \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ - +#define MAKE_PARSER(type, name) \ + 
Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} MAKE_PARSER(char, parse_char); - MAKE_PARSER(signed char, parse_char); + MAKE_PARSER(signed char, parse_schar); MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + MAKE_PARSER(short, parse_short); MAKE_PARSER(unsigned short, parse_ushort); MAKE_PARSER(int, parse_int); MAKE_PARSER(unsigned int, parse_uint); MAKE_PARSER(long, parse_long); MAKE_PARSER(unsigned long, parse_ulong); - #ifdef RE2_HAVE_LONGLONG MAKE_PARSER(long long, parse_longlong); MAKE_PARSER(unsigned long long, parse_ulonglong); - #endif - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); #undef MAKE_PARSER - // Generic constructor - template Arg(T*, Parser parser); - // Generic constructor template + // Generic constructor templates template Arg(T* p) - : arg_(p), parser_(_RE2_MatchObject::Parse) { - } + : arg_(p), parser_(_RE2_MatchObject::Parse) { } + template Arg(T* p, Parser parser) + : arg_(p), parser_(parser) { } // Parse the data - bool Parse(const char* str, int n) const; + bool Parse(const char* str, size_t n) const; private: void* arg_; Parser parser_; - static bool parse_null (const char* str, int n, void* dest); - static bool parse_char (const char* str, int n, void* dest); - static bool parse_uchar (const char* str, int n, void* dest); - static bool parse_float (const char* str, int n, void* dest); - static bool parse_double (const char* str, int n, void* dest); - static bool parse_string (const char* str, int n, void* dest); - static bool parse_stringpiece (const char* str, int n, void* dest); + static bool parse_null (const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool 
parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_stringpiece (const char* str, size_t n, void* dest); -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_ ## name(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _radix( \ - const char* str, int n, void* dest, int radix); \ - public: \ - static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _cradix(const char* str, int n, void* dest) +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, size_t n, void* dest); \ + static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ + static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ + static bool parse_##name##_cradix(const char* str, size_t n, void* dest) DECLARE_INTEGER_PARSER(short); DECLARE_INTEGER_PARSER(ushort); @@ -833,29 +840,31 @@ class RE2::Arg { DECLARE_INTEGER_PARSER(uint); DECLARE_INTEGER_PARSER(long); DECLARE_INTEGER_PARSER(ulong); - #ifdef RE2_HAVE_LONGLONG DECLARE_INTEGER_PARSER(longlong); DECLARE_INTEGER_PARSER(ulonglong); - #endif #undef DECLARE_INTEGER_PARSER + }; inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { } inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } -inline bool RE2::Arg::Parse(const char* str, int n) const { +inline bool RE2::Arg::Parse(const char* str, size_t n) const { return (*parser_)(str, n, arg_); } // This part of the parser, appropriate only for ints, deals with 
bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline RE2::Arg RE2::Hex(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \ - inline RE2::Arg RE2::Octal(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \ - inline RE2::Arg RE2::CRadix(type* ptr) { \ - return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); } +#define MAKE_INTEGER_PARSER(type, name) \ + inline RE2::Arg RE2::Hex(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \ + } \ + inline RE2::Arg RE2::Octal(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \ + } \ + inline RE2::Arg RE2::CRadix(type* ptr) { \ + return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \ + } MAKE_INTEGER_PARSER(short, short) MAKE_INTEGER_PARSER(unsigned short, ushort) @@ -863,15 +872,70 @@ MAKE_INTEGER_PARSER(int, int) MAKE_INTEGER_PARSER(unsigned int, uint) MAKE_INTEGER_PARSER(long, long) MAKE_INTEGER_PARSER(unsigned long, ulong) -#ifdef RE2_HAVE_LONGLONG MAKE_INTEGER_PARSER(long long, longlong) MAKE_INTEGER_PARSER(unsigned long long, ulonglong) -#endif #undef MAKE_INTEGER_PARSER +#ifndef SWIG + +// Silence warnings about missing initializers for members of LazyRE2. +// Note that we test for Clang first because it defines __GNUC__ as well. +#if defined(__clang__) +#elif defined(__GNUC__) && __GNUC__ >= 6 +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#endif + +// Helper for writing global or static RE2s safely. +// Write +// static LazyRE2 re = {".*"}; +// and then use *re instead of writing +// static RE2 re(".*"); +// The former is more careful about multithreaded +// situations than the latter. +// +// N.B. This class never deletes the RE2 object that +// it constructs: that's a feature, so that it can be used +// for global and function static variables. 
+class LazyRE2 { + private: + struct NoArg {}; + + public: + typedef RE2 element_type; // support std::pointer_traits + + // Constructor omitted to preserve braced initialization in C++98. + + // Pretend to be a pointer to Type (never NULL due to on-demand creation): + RE2& operator*() const { return *get(); } + RE2* operator->() const { return get(); } + + // Named accessor/initializer: + RE2* get() const { + std::call_once(once_, &LazyRE2::Init, this); + return ptr_; + } + + // All data fields must be public to support {"foo"} initialization. + const char* pattern_; + RE2::CannedOptions options_; + NoArg barrier_against_excess_initializers_; + + mutable RE2* ptr_; + mutable std::once_flag once_; + + private: + static void Init(const LazyRE2* lazy_re2) { + lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_); + } + + void operator=(const LazyRE2&); // disallowed +}; +#endif // SWIG + } // namespace re2 using re2::RE2; +using re2::LazyRE2; -#endif /* RE2_RE2_H */ +#endif // RE2_RE2_H_ diff --git a/contrib/libre2/re2/regexp.cc b/contrib/libre2/re2/regexp.cc index 13874afb907..34209bcaa0b 100644 --- a/contrib/libre2/re2/regexp.cc +++ b/contrib/libre2/re2/regexp.cc @@ -5,8 +5,21 @@ // Regular expression representation. // Tested by parse_test.cc -#include "util/util.h" #include "re2/regexp.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util/util.h" +#include "util/logging.h" +#include "util/mutex.h" +#include "util/utf.h" #include "re2/stringpiece.h" #include "re2/walker-inl.h" @@ -14,9 +27,9 @@ namespace re2 { // Constructor. Allocates vectors as appropriate for operator. 
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) - : op_(op), + : op_(static_cast(op)), simple_(false), - parse_flags_(static_cast(parse_flags)), + parse_flags_(static_cast(parse_flags)), ref_(1), nsub_(0), down_(NULL) { @@ -43,7 +56,8 @@ Regexp::~Regexp() { delete[] runes_; break; case kRegexpCharClass: - cc_->Delete(); + if (cc_) + cc_->Delete(); delete ccb_; break; } @@ -59,30 +73,29 @@ bool Regexp::QuickDestroy() { return false; } -static map *ref_map; -GLOBAL_MUTEX(ref_mutex); +// Lazily allocated. +static Mutex* ref_mutex; +static std::map* ref_map; int Regexp::Ref() { if (ref_ < kMaxRef) return ref_; - GLOBAL_MUTEX_LOCK(ref_mutex); - int r = 0; - if (ref_map != NULL) { - r = (*ref_map)[this]; - } - GLOBAL_MUTEX_UNLOCK(ref_mutex); - return r; + MutexLock l(ref_mutex); + return (*ref_map)[this]; } // Increments reference count, returns object as convenience. Regexp* Regexp::Incref() { if (ref_ >= kMaxRef-1) { + static std::once_flag ref_once; + std::call_once(ref_once, []() { + ref_mutex = new Mutex; + ref_map = new std::map; + }); + // Store ref count in overflow map. - GLOBAL_MUTEX_LOCK(ref_mutex); - if (ref_map == NULL) { - ref_map = new map; - } + MutexLock l(ref_mutex); if (ref_ == kMaxRef) { // already overflowed (*ref_map)[this]++; @@ -91,7 +104,6 @@ Regexp* Regexp::Incref() { (*ref_map)[this] = kMaxRef; ref_ = kMaxRef; } - GLOBAL_MUTEX_UNLOCK(ref_mutex); return this; } @@ -103,15 +115,14 @@ Regexp* Regexp::Incref() { void Regexp::Decref() { if (ref_ == kMaxRef) { // Ref count is stored in overflow map. 
- GLOBAL_MUTEX_LOCK(ref_mutex); + MutexLock l(ref_mutex); int r = (*ref_map)[this] - 1; if (r < kMaxRef) { - ref_ = r; + ref_ = static_cast(r); ref_map->erase(this); } else { (*ref_map)[this] = r; } - GLOBAL_MUTEX_UNLOCK(ref_mutex); return; } ref_--; @@ -179,31 +190,45 @@ Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { return re; } -Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) +Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { + // Squash **, ++ and ??. + if (op == sub->op() && flags == sub->parse_flags()) return sub; - Regexp* re = new Regexp(kRegexpPlus, flags); + + // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because + // op is Star/Plus/Quest, we just have to check that sub->op() is too. + if ((sub->op() == kRegexpStar || + sub->op() == kRegexpPlus || + sub->op() == kRegexpQuest) && + flags == sub->parse_flags()) { + // If sub is Star, no need to rewrite it. + if (sub->op() == kRegexpStar) + return sub; + + // Rewrite sub to Star. + Regexp* re = new Regexp(kRegexpStar, flags); + re->AllocSub(1); + re->sub()[0] = sub->sub()[0]->Incref(); + sub->Decref(); // We didn't consume the reference after all. 
+ return re; + } + + Regexp* re = new Regexp(op, flags); re->AllocSub(1); re->sub()[0] = sub; return re; } +Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { + return StarPlusOrQuest(kRegexpPlus, sub, flags); +} + Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpStar && sub->parse_flags() == flags) - return sub; - Regexp* re = new Regexp(kRegexpStar, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; + return StarPlusOrQuest(kRegexpStar, sub, flags); } Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { - if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) - return sub; - Regexp* re = new Regexp(kRegexpQuest, flags); - re->AllocSub(1); - re->sub()[0] = sub; - return re; + return StarPlusOrQuest(kRegexpQuest, sub, flags); } Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, @@ -211,6 +236,13 @@ Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, if (nsub == 1) return sub[0]; + if (nsub == 0) { + if (op == kRegexpAlternate) + return new Regexp(kRegexpNoMatch, flags); + else + return new Regexp(kRegexpEmptyMatch, flags); + } + Regexp** subcopy = NULL; if (op == kRegexpAlternate && can_factor) { // Going to edit sub; make a copy so we don't step on caller. @@ -405,7 +437,7 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { // The stack (vector) has pairs of regexps waiting to // be compared. The regexps are only equal if // all the pairs end up being equal. - vector stk; + std::vector stk; for (;;) { // Invariant: TopEqual(a, b) == true. 
@@ -445,10 +477,11 @@ bool Regexp::Equal(Regexp* a, Regexp* b) { continue; } - int n = stk.size(); + size_t n = stk.size(); if (n == 0) break; + DCHECK_GE(n, 2); a = stk[n-2]; b = stk[n-1]; stk.resize(n-2); @@ -517,7 +550,9 @@ class NumCapturesWalker : public Regexp::Walker { private: int ncapture_; - DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker); + + NumCapturesWalker(const NumCapturesWalker&) = delete; + NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; }; int Regexp::NumCaptures() { @@ -532,8 +567,8 @@ class NamedCapturesWalker : public Regexp::Walker { NamedCapturesWalker() : map_(NULL) {} ~NamedCapturesWalker() { delete map_; } - map* TakeMap() { - map* m = map_; + std::map* TakeMap() { + std::map* m = map_; map_ = NULL; return m; } @@ -542,7 +577,7 @@ class NamedCapturesWalker : public Regexp::Walker { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. if (map_ == NULL) - map_ = new map; + map_ = new std::map; // Record first occurrence of each name. // (The rule is that if you have the same name @@ -560,11 +595,13 @@ class NamedCapturesWalker : public Regexp::Walker { } private: - map* map_; - DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker); + std::map* map_; + + NamedCapturesWalker(const NamedCapturesWalker&) = delete; + NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; }; -map* Regexp::NamedCaptures() { +std::map* Regexp::NamedCaptures() { NamedCapturesWalker w; w.Walk(this, 0); return w.TakeMap(); @@ -576,8 +613,8 @@ class CaptureNamesWalker : public Regexp::Walker { CaptureNamesWalker() : map_(NULL) {} ~CaptureNamesWalker() { delete map_; } - map* TakeMap() { - map* m = map_; + std::map* TakeMap() { + std::map* m = map_; map_ = NULL; return m; } @@ -586,7 +623,7 @@ class CaptureNamesWalker : public Regexp::Walker { if (re->op() == kRegexpCapture && re->name() != NULL) { // Allocate map once we find a name. 
if (map_ == NULL) - map_ = new map; + map_ = new std::map; (*map_)[re->cap()] = *re->name(); } @@ -600,11 +637,13 @@ class CaptureNamesWalker : public Regexp::Walker { } private: - map* map_; - DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker); + std::map* map_; + + CaptureNamesWalker(const CaptureNamesWalker&) = delete; + CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; }; -map* Regexp::CaptureNames() { +std::map* Regexp::CaptureNames() { CaptureNamesWalker w; w.Walk(this, 0); return w.TakeMap(); @@ -614,7 +653,7 @@ map* Regexp::CaptureNames() { // with a fixed string prefix. If so, returns the prefix and // the regexp that remains after the prefix. The prefix might // be ASCII case-insensitive. -bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { +bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) { // No need for a walker: the regexp must be of the form // 1. some number of ^ anchors // 2. a literal char or string @@ -643,7 +682,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { if (re->parse_flags() & Latin1) { prefix->resize(re->nrunes_); for (int j = 0; j < re->nrunes_; j++) - (*prefix)[j] = re->runes_[j]; + (*prefix)[j] = static_cast(re->runes_[j]); } else { // Convert to UTF-8 in place. // Assume worst-case space and then trim. 
@@ -652,7 +691,7 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { for (int j = 0; j < re->nrunes_; j++) { Rune r = re->runes_[j]; if (r < Runeself) - *p++ = r; + *p++ = static_cast(r); else p += runetochar(p, &r); } @@ -662,14 +701,14 @@ bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { case kRegexpLiteral: if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { - prefix->append(1, re->rune_); + prefix->append(1, static_cast(re->rune_)); } else { char buf[UTFmax]; prefix->append(buf, runetochar(buf, &re->rune_)); } break; } - *foldcase = (sub[i]->parse_flags() & FoldCase); + *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; i++; // The rest. @@ -704,13 +743,13 @@ bool CharClassBuilder::AddRange(Rune lo, Rune hi) { if (lo <= 'z' && hi >= 'A') { // Overlaps some alpha, maybe not all. // Update bitmaps telling which ASCII letters are in the set. - Rune lo1 = max(lo, 'A'); - Rune hi1 = min(hi, 'Z'); + Rune lo1 = std::max(lo, 'A'); + Rune hi1 = std::min(hi, 'Z'); if (lo1 <= hi1) upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); - lo1 = max(lo, 'a'); - hi1 = min(hi, 'z'); + lo1 = std::max(lo, 'a'); + hi1 = std::min(hi, 'z'); if (lo1 <= hi1) lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); } @@ -826,7 +865,7 @@ void CharClassBuilder::RemoveAbove(Rune r) { void CharClassBuilder::Negate() { // Build up negation and then copy in. // Could edit ranges in place, but C++ won't let me. 
- vector v; + std::vector v; v.reserve(ranges_.size() + 1); // In negation, first range begins at 0, unless @@ -863,7 +902,7 @@ void CharClassBuilder::Negate() { CharClass* CharClass::New(int maxranges) { CharClass* cc; - uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; + uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; cc = reinterpret_cast(data); cc->ranges_ = reinterpret_cast(data + sizeof *cc); cc->nranges_ = 0; @@ -873,7 +912,7 @@ CharClass* CharClass::New(int maxranges) { } void CharClass::Delete() { - uint8 *data = reinterpret_cast(this); + uint8_t* data = reinterpret_cast(this); delete[] data; } @@ -915,7 +954,7 @@ bool CharClass::Contains(Rune r) { } CharClass* CharClassBuilder::GetCharClass() { - CharClass* cc = CharClass::New(ranges_.size()); + CharClass* cc = CharClass::New(static_cast(ranges_.size())); int n = 0; for (iterator it = begin(); it != end(); ++it) cc->ranges_[n++] = *it; diff --git a/contrib/libre2/re2/regexp.h b/contrib/libre2/re2/regexp.h index 331c017673c..2ca96cd3eb3 100644 --- a/contrib/libre2/re2/regexp.h +++ b/contrib/libre2/re2/regexp.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_REGEXP_H_ +#define RE2_REGEXP_H_ + // --- SPONSORED LINK -------------------------------------------------- // If you want to use this library for regular expression matching, // you should use re2/re2.h, which provides a class RE2 that @@ -83,10 +86,14 @@ // form accessible to clients, so that client code can analyze the // parsed regular expressions. 
-#ifndef RE2_REGEXP_H__ -#define RE2_REGEXP_H__ +#include +#include +#include +#include #include "util/util.h" +#include "util/logging.h" +#include "util/utf.h" #include "re2/stringpiece.h" namespace re2 { @@ -185,10 +192,10 @@ class RegexpStatus { RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {} ~RegexpStatus() { delete tmp_; } - void set_code(enum RegexpStatusCode code) { code_ = code; } + void set_code(RegexpStatusCode code) { code_ = code; } void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; } void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; } - enum RegexpStatusCode code() const { return code_; } + RegexpStatusCode code() const { return code_; } const StringPiece& error_arg() const { return error_arg_; } bool ok() const { return code() == kRegexpSuccess; } @@ -197,23 +204,21 @@ class RegexpStatus { // Returns text equivalent of code, e.g.: // "Bad character class" - static string CodeText(enum RegexpStatusCode code); + static string CodeText(RegexpStatusCode code); // Returns text describing error, e.g.: // "Bad character class: [z-a]" string Text() const; private: - enum RegexpStatusCode code_; // Kind of error + RegexpStatusCode code_; // Kind of error StringPiece error_arg_; // Piece of regexp containing syntax error. string* tmp_; // Temporary storage, possibly where error_arg_ is. - DISALLOW_EVIL_CONSTRUCTORS(RegexpStatus); + RegexpStatus(const RegexpStatus&) = delete; + RegexpStatus& operator=(const RegexpStatus&) = delete; }; -// Walker to implement Simplify. -class SimplifyWalker; - // Compiled form; see prog.h class Prog; @@ -261,7 +266,9 @@ class CharClass { int nrunes_; RuneRange *ranges_; int nranges_; - DISALLOW_EVIL_CONSTRUCTORS(CharClass); + + CharClass(const CharClass&) = delete; + CharClass& operator=(const CharClass&) = delete; }; class Regexp { @@ -269,51 +276,52 @@ class Regexp { // Flags for parsing. Can be ORed together. 
enum ParseFlags { - NoParseFlags = 0, - FoldCase = 1<<0, // Fold case during matching (case-insensitive). - Literal = 1<<1, // Treat s as literal string instead of a regexp. - ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s - // and [[:space:]] to match newline. - DotNL = 1<<3, // Allow . to match newline. - MatchNL = ClassNL | DotNL, - OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and - // end of text, not around embedded newlines. - // (Perl's default) - Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. - NonGreedy = 1<<6, // Repetition operators are non-greedy by default. - PerlClasses = 1<<7, // Allow Perl character classes like \d. - PerlB = 1<<8, // Allow Perl's \b and \B. - PerlX = 1<<9, // Perl extensions: - // non-capturing parens - (?: ) - // non-greedy operators - *? +? ?? {}? - // flag edits - (?i) (?-i) (?i: ) - // i - FoldCase - // m - !OneLine - // s - DotNL - // U - NonGreedy - // line ends: \A \z - // \Q and \E to disable/enable metacharacters - // (?Pexpr) for named captures - // \C to match any single byte - UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group - // and \P{Han} for its negation. - NeverNL = 1<<11, // Never match NL, even if the regexp mentions - // it explicitly. - NeverCapture = 1<<12, // Parse all parens as non-capturing. + NoParseFlags = 0, + FoldCase = 1<<0, // Fold case during matching (case-insensitive). + Literal = 1<<1, // Treat s as literal string instead of a regexp. + ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s + // and [[:space:]] to match newline. + DotNL = 1<<3, // Allow . to match newline. + MatchNL = ClassNL | DotNL, + OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and + // end of text, not around embedded newlines. + // (Perl's default) + Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8. + NonGreedy = 1<<6, // Repetition operators are non-greedy by default. 
+ PerlClasses = 1<<7, // Allow Perl character classes like \d. + PerlB = 1<<8, // Allow Perl's \b and \B. + PerlX = 1<<9, // Perl extensions: + // non-capturing parens - (?: ) + // non-greedy operators - *? +? ?? {}? + // flag edits - (?i) (?-i) (?i: ) + // i - FoldCase + // m - !OneLine + // s - DotNL + // U - NonGreedy + // line ends: \A \z + // \Q and \E to disable/enable metacharacters + // (?Pexpr) for named captures + // \C to match any single byte + UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group + // and \P{Han} for its negation. + NeverNL = 1<<11, // Never match NL, even if the regexp mentions + // it explicitly. + NeverCapture = 1<<12, // Parse all parens as non-capturing. // As close to Perl as we can get. - LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | - UnicodeGroups, + LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX | + UnicodeGroups, // Internal use only. - WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text + WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text + AllParseFlags = (1<<14)-1, }; // Get. No set, Regexps are logically immutable once created. RegexpOp op() { return static_cast(op_); } int nsub() { return nsub_; } - bool simple() { return simple_; } - enum ParseFlags parse_flags() { return static_cast(parse_flags_); } + bool simple() { return simple_ != 0; } + ParseFlags parse_flags() { return static_cast(parse_flags_); } int Ref(); // For testing. Regexp** sub() { @@ -353,6 +361,7 @@ class Regexp { // removed. The result will capture exactly the same // subexpressions the original did, unless formatted with ToString. Regexp* Simplify(); + friend class CoalesceWalker; friend class SimplifyWalker; // Parses the regexp src and then simplifies it and sets *dst to the @@ -369,12 +378,12 @@ class Regexp { // Returns a map from names to capturing group indices, // or NULL if the regexp contains no named capture groups. // The caller is responsible for deleting the map. 
- map* NamedCaptures(); + std::map* NamedCaptures(); // Returns a map from capturing group indices to capturing group // names or NULL if the regexp contains no named capture groups. The // caller is responsible for deleting the map. - map* CaptureNames(); + std::map* CaptureNames(); // Returns a string representation of the current regexp, // using as few parentheses as possible. @@ -410,8 +419,8 @@ class Regexp { // Construction and execution of prog will // stay within approximately max_mem bytes of memory. // If max_mem <= 0, a reasonable default is used. - Prog* CompileToProg(int64 max_mem); - Prog* CompileToReverseProg(int64 max_mem); + Prog* CompileToProg(int64_t max_mem); + Prog* CompileToReverseProg(int64_t max_mem); // Whether to expect this library to find exactly the same answer as PCRE // when running this regexp. Most regexps do mimic PCRE exactly, but a few @@ -427,7 +436,9 @@ class Regexp { // begin with a non-empty fixed string (perhaps after ASCII // case-folding). If so, returns the prefix and the sub-regexp that // follows it. - bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix); + // Callers should expect *prefix, *foldcase and *suffix to be "zeroed" + // regardless of the return value. + bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix); private: // Constructor allocates vectors as appropriate for operator. @@ -441,6 +452,7 @@ class Regexp { // Helpers for Parse. Listed here so they can edit Regexps. class ParseState; + friend class ParseState; friend bool ParseCharClass(StringPiece* s, Regexp** out_re, RegexpStatus* status); @@ -451,6 +463,10 @@ class Regexp { // Computes whether Regexp is already simple. bool ComputeSimple(); + // Constructor that generates a Star, Plus or Quest, + // squashing the pair if sub is also a Star, Plus or Quest. 
+ static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags); + // Constructor that generates a concatenation or alternation, // enforcing the limit on the number of subexpressions for // a particular Regexp. @@ -478,8 +494,7 @@ class Regexp { // Simplifies an alternation of literal strings by factoring out // common prefixes. static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags); - static int FactorAlternationRecursive(Regexp** sub, int nsub, - ParseFlags flags, int maxdepth); + friend class FactorAlternationImpl; // Is a == b? Only efficient on regexps that have not been through // Simplify yet - the expansion of a kRegexpRepeat will make this @@ -488,11 +503,10 @@ class Regexp { // Allocate space for n sub-regexps. void AllocSub(int n) { - if (n < 0 || static_cast(n) != n) - LOG(FATAL) << "Cannot AllocSub " << n; + DCHECK(n >= 0 && static_cast(n) == n); if (n > 1) submany_ = new Regexp*[n]; - nsub_ = n; + nsub_ = static_cast(n); } // Add Rune to LiteralString @@ -502,38 +516,38 @@ class Regexp { void Swap(Regexp *that); // Operator. See description of operators above. - // uint8 instead of RegexpOp to control space usage. - uint8 op_; + // uint8_t instead of RegexpOp to control space usage. + uint8_t op_; // Is this regexp structure already simple // (has it been returned by Simplify)? - // uint8 instead of bool to control space usage. - uint8 simple_; + // uint8_t instead of bool to control space usage. + uint8_t simple_; // Flags saved from parsing and used during execution. // (Only FoldCase is used.) - // uint16 instead of ParseFlags to control space usage. - uint16 parse_flags_; + // uint16_t instead of ParseFlags to control space usage. + uint16_t parse_flags_; // Reference count. Exists so that SimplifyRegexp can build // regexp structures that are dags rather than trees to avoid // exponential blowup in space requirements. - // uint16 to control space usage. + // uint16_t to control space usage. 
// The standard regexp routines will never generate a - // ref greater than the maximum repeat count (100), + // ref greater than the maximum repeat count (kMaxRepeat), // but even so, Incref and Decref consult an overflow map // when ref_ reaches kMaxRef. - uint16 ref_; - static const uint16 kMaxRef = 0xffff; + uint16_t ref_; + static const uint16_t kMaxRef = 0xffff; // Subexpressions. - // uint16 to control space usage. + // uint16_t to control space usage. // Concat and Alternate handle larger numbers of subexpressions // by building concatenation or alternation trees. // Other routines should call Concat or Alternate instead of // filling in sub() by hand. - uint16 nsub_; - static const uint16 kMaxNsub = 0xffff; + uint16_t nsub_; + static const uint16_t kMaxNsub = 0xffff; union { Regexp** submany_; // if nsub_ > 1 Regexp* subone_; // if nsub_ == 1 @@ -568,11 +582,12 @@ class Regexp { void *the_union_[2]; // as big as any other element, for memset }; - DISALLOW_EVIL_CONSTRUCTORS(Regexp); + Regexp(const Regexp&) = delete; + Regexp& operator=(const Regexp&) = delete; }; // Character class set: contains non-overlapping, non-abutting RuneRanges. -typedef set RuneRangeSet; +typedef std::set RuneRangeSet; class CharClassBuilder { public: @@ -597,37 +612,41 @@ class CharClassBuilder { void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags); private: - static const uint32 AlphaMask = (1<<26) - 1; - uint32 upper_; // bitmap of A-Z - uint32 lower_; // bitmap of a-z + static const uint32_t AlphaMask = (1<<26) - 1; + uint32_t upper_; // bitmap of A-Z + uint32_t lower_; // bitmap of a-z int nrunes_; RuneRangeSet ranges_; - DISALLOW_EVIL_CONSTRUCTORS(CharClassBuilder); + + CharClassBuilder(const CharClassBuilder&) = delete; + CharClassBuilder& operator=(const CharClassBuilder&) = delete; }; -// Tell g++ that bitwise ops on ParseFlags produce ParseFlags. 
-inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b) -{ - return static_cast(static_cast(a) | static_cast(b)); +// Bitwise ops on ParseFlags produce ParseFlags. +inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, + Regexp::ParseFlags b) { + return static_cast( + static_cast(a) | static_cast(b)); } -inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b) -{ - return static_cast(static_cast(a) ^ static_cast(b)); +inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, + Regexp::ParseFlags b) { + return static_cast( + static_cast(a) ^ static_cast(b)); } -inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b) -{ - return static_cast(static_cast(a) & static_cast(b)); +inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, + Regexp::ParseFlags b) { + return static_cast( + static_cast(a) & static_cast(b)); } -inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) -{ - return static_cast(~static_cast(a)); +inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) { + // Attempting to produce a value out of enum's range has undefined behaviour. 
+ return static_cast( + ~static_cast(a) & static_cast(Regexp::AllParseFlags)); } - - } // namespace re2 -#endif // RE2_REGEXP_H__ +#endif // RE2_REGEXP_H_ diff --git a/contrib/libre2/re2/set.cc b/contrib/libre2/re2/set.cc index 8eec9c83320..8f736c49eae 100644 --- a/contrib/libre2/re2/set.cc +++ b/contrib/libre2/re2/set.cc @@ -4,36 +4,42 @@ #include "re2/set.h" +#include +#include +#include + #include "util/util.h" +#include "util/logging.h" #include "re2/stringpiece.h" #include "re2/prog.h" #include "re2/re2.h" #include "re2/regexp.h" -using namespace re2; +namespace re2 { RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) { options_.Copy(options); + options_.set_never_capture(true); // might unblock some optimisations anchor_ = anchor; prog_ = NULL; compiled_ = false; + size_ = 0; } RE2::Set::~Set() { - for (size_t i = 0; i < re_.size(); i++) - re_[i]->Decref(); + for (size_t i = 0; i < elem_.size(); i++) + elem_[i].second->Decref(); delete prog_; } int RE2::Set::Add(const StringPiece& pattern, string* error) { if (compiled_) { - LOG(DFATAL) << "RE2::Set::Add after Compile"; + LOG(DFATAL) << "RE2::Set::Add() called after compiling"; return -1; } Regexp::ParseFlags pf = static_cast( options_.ParseFlags()); - RegexpStatus status; re2::Regexp* re = Regexp::Parse(pattern, pf, &status); if (re == NULL) { @@ -45,7 +51,7 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) { } // Concatenate with match index and push on vector. 
- int n = re_.size(); + int n = static_cast(elem_.size()); re2::Regexp* m = re2::Regexp::HaveMatch(n, pf); if (re->op() == kRegexpConcat) { int nsub = re->nsub(); @@ -62,52 +68,87 @@ int RE2::Set::Add(const StringPiece& pattern, string* error) { sub[1] = m; re = re2::Regexp::Concat(sub, 2, pf); } - re_.push_back(re); + elem_.emplace_back(pattern.ToString(), re); return n; } bool RE2::Set::Compile() { if (compiled_) { - LOG(DFATAL) << "RE2::Set::Compile multiple times"; + LOG(DFATAL) << "RE2::Set::Compile() called more than once"; return false; } compiled_ = true; + size_ = static_cast(elem_.size()); + + // Sort the elements by their patterns. This is good enough for now + // until we have a Regexp comparison function. (Maybe someday...) + std::sort(elem_.begin(), elem_.end(), + [](const Elem& a, const Elem& b) -> bool { + return a.first < b.first; + }); + + re2::Regexp** sub = new re2::Regexp*[size_]; + for (size_t i = 0; i < elem_.size(); i++) + sub[i] = elem_[i].second; + elem_.clear(); + elem_.shrink_to_fit(); Regexp::ParseFlags pf = static_cast( options_.ParseFlags()); - re2::Regexp* re = re2::Regexp::Alternate(const_cast(&re_[0]), - re_.size(), pf); - re_.clear(); - re2::Regexp* sre = re->Simplify(); - re->Decref(); - re = sre; - if (re == NULL) { - if (options_.log_errors()) - LOG(ERROR) << "Error simplifying during Compile."; - return false; - } + re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf); + delete[] sub; - prog_ = Prog::CompileSet(options_, anchor_, re); + prog_ = Prog::CompileSet(re, anchor_, options_.max_mem()); + re->Decref(); return prog_ != NULL; } -bool RE2::Set::Match(const StringPiece& text, vector* v) const { - if (!compiled_) { - LOG(DFATAL) << "RE2::Set::Match without Compile"; - return false; - } - v->clear(); - bool failed; - bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, - Prog::kManyMatch, NULL, &failed, v); - if (failed) - LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space"; +bool RE2::Set::Match(const 
StringPiece& text, std::vector* v) const { + return Match(text, v, NULL); +} - if (ret == false) - return false; - if (v->size() == 0) { - LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set"; +bool RE2::Set::Match(const StringPiece& text, std::vector* v, + ErrorInfo* error_info) const { + if (!compiled_) { + LOG(DFATAL) << "RE2::Set::Match() called before compiling"; + if (error_info != NULL) + error_info->kind = kNotCompiled; return false; } + bool dfa_failed = false; + std::unique_ptr matches; + if (v != NULL) { + matches.reset(new SparseSet(size_)); + v->clear(); + } + bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch, + NULL, &dfa_failed, matches.get()); + if (dfa_failed) { + if (options_.log_errors()) + LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", " + << "bytemap range " << prog_->bytemap_range() << ", " + << "list count " << prog_->list_count(); + if (error_info != NULL) + error_info->kind = kOutOfMemory; + return false; + } + if (ret == false) { + if (error_info != NULL) + error_info->kind = kNoError; + return false; + } + if (v != NULL) { + if (matches->empty()) { + LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!"; + if (error_info != NULL) + error_info->kind = kInconsistent; + return false; + } + v->assign(matches->begin(), matches->end()); + } + if (error_info != NULL) + error_info->kind = kNoError; return true; } + +} // namespace re2 diff --git a/contrib/libre2/re2/set.h b/contrib/libre2/re2/set.h index d7164257f16..a8c2caa4a87 100644 --- a/contrib/libre2/re2/set.h +++ b/contrib/libre2/re2/set.h @@ -2,54 +2,79 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-#ifndef RE2_SET_H -#define RE2_SET_H +#ifndef RE2_SET_H_ +#define RE2_SET_H_ +#include #include #include #include "re2/re2.h" namespace re2 { -using std::vector; +class Prog; +class Regexp; +} // namespace re2 + +namespace re2 { // An RE2::Set represents a collection of regexps that can // be searched for simultaneously. class RE2::Set { public: + enum ErrorKind { + kNoError = 0, + kNotCompiled, // The set is not compiled. + kOutOfMemory, // The DFA ran out of memory. + kInconsistent, // The result is inconsistent. This should never happen. + }; + + struct ErrorInfo { + ErrorKind kind; + }; + Set(const RE2::Options& options, RE2::Anchor anchor); ~Set(); - // Add adds regexp pattern to the set, interpreted using the RE2 options. - // (The RE2 constructor's default options parameter is RE2::UTF8.) - // Add returns the regexp index that will be used to identify - // it in the result of Match, or -1 if the regexp cannot be parsed. + // Adds pattern to the set using the options passed to the constructor. + // Returns the index that will identify the regexp in the output of Match(), + // or -1 if the regexp cannot be parsed. // Indices are assigned in sequential order starting from 0. - // Error returns do not increment the index. - // If an error occurs and error != NULL, *error will hold an error message. + // Errors do not increment the index; if error is not NULL, *error will hold + // the error message from the parser. int Add(const StringPiece& pattern, string* error); - // Compile prepares the Set for matching. - // Add must not be called again after Compile. - // Compile must be called before FullMatch or PartialMatch. - // Compile may return false if it runs out of memory. + // Compiles the set in preparation for matching. + // Returns false if the compiler runs out of memory. + // Add() must not be called again after Compile(). + // Compile() must be called before Match(). bool Compile(); - // Match returns true if text matches any of the regexps in the set. 
- // If so, it fills v with the indices of the matching regexps. - bool Match(const StringPiece& text, vector* v) const; + // Returns true if text matches at least one of the regexps in the set. + // Fills v (if not NULL) with the indices of the matching regexps. + // Callers must not expect v to be sorted. + bool Match(const StringPiece& text, std::vector* v) const; + + // As above, but populates error_info (if not NULL) when none of the regexps + // in the set matched. This can inform callers when DFA execution fails, for + // example, because they might wish to handle that case differently. + bool Match(const StringPiece& text, std::vector* v, + ErrorInfo* error_info) const; private: + typedef std::pair Elem; + RE2::Options options_; RE2::Anchor anchor_; - vector re_; + std::vector elem_; re2::Prog* prog_; bool compiled_; - //DISALLOW_EVIL_CONSTRUCTORS(Set); - Set(const Set&); - void operator=(const Set&); + int size_; + + Set(const Set&) = delete; + Set& operator=(const Set&) = delete; }; } // namespace re2 -#endif // RE2_SET_H +#endif // RE2_SET_H_ diff --git a/contrib/libre2/re2/simplify.cc b/contrib/libre2/re2/simplify.cc index faf32084e05..910ebcc2763 100644 --- a/contrib/libre2/re2/simplify.cc +++ b/contrib/libre2/re2/simplify.cc @@ -6,7 +6,11 @@ // to use simple extended regular expression features. // Also sort and simplify character classes. +#include + #include "util/util.h" +#include "util/logging.h" +#include "util/utf.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -61,7 +65,7 @@ bool Regexp::ComputeSimple() { // These are simple as long as the subpieces are simple. 
subs = sub(); for (int i = 0; i < nsub_; i++) - if (!subs[i]->simple_) + if (!subs[i]->simple()) return false; return true; case kRegexpCharClass: @@ -71,12 +75,12 @@ bool Regexp::ComputeSimple() { return !cc_->empty() && !cc_->full(); case kRegexpCapture: subs = sub(); - return subs[0]->simple_; + return subs[0]->simple(); case kRegexpStar: case kRegexpPlus: case kRegexpQuest: subs = sub(); - if (!subs[0]->simple_) + if (!subs[0]->simple()) return false; switch (subs[0]->op_) { case kRegexpStar: @@ -96,6 +100,37 @@ bool Regexp::ComputeSimple() { return false; } +// Walker subclass used by Simplify. +// Coalesces runs of star/plus/quest/repeat of the same literal along with any +// occurrences of that literal into repeats of that literal. It also works for +// char classes, any char and any byte. +// PostVisit creates the coalesced result, which should then be simplified. +class CoalesceWalker : public Regexp::Walker { + public: + CoalesceWalker() {} + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, + Regexp** child_args, int nchild_args); + virtual Regexp* Copy(Regexp* re); + virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); + + private: + // These functions are declared inside CoalesceWalker so that + // they can edit the private fields of the Regexps they construct. + + // Returns true if r1 and r2 can be coalesced. In particular, ensures that + // the parse flags are consistent. (They will not be checked again later.) + static bool CanCoalesce(Regexp* r1, Regexp* r2); + + // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards + // will be empty match and the coalesced op. In other cases, where part of a + // literal string was removed to be coalesced, the array elements afterwards + // will be the coalesced op and the remainder of the literal string. 
+ static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); + + CoalesceWalker(const CoalesceWalker&) = delete; + CoalesceWalker& operator=(const CoalesceWalker&) = delete; +}; + // Walker subclass used by Simplify. // The simplify walk is purely post-recursive: given the simplified children, // PostVisit creates the simplified result. @@ -104,9 +139,7 @@ class SimplifyWalker : public Regexp::Walker { public: SimplifyWalker() {} virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); - virtual Regexp* PostVisit(Regexp* re, - Regexp* parent_arg, - Regexp* pre_arg, + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, Regexp** child_args, int nchild_args); virtual Regexp* Copy(Regexp* re); virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); @@ -130,7 +163,8 @@ class SimplifyWalker : public Regexp::Walker { // Caller must Decref return value when done with it. static Regexp* SimplifyCharClass(Regexp* re); - DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); + SimplifyWalker(const SimplifyWalker&) = delete; + SimplifyWalker& operator=(const SimplifyWalker&) = delete; }; // Simplifies a regular expression, returning a new regexp. @@ -143,14 +177,261 @@ class SimplifyWalker : public Regexp::Walker { // Caller must Decref() return value when done with it. Regexp* Regexp::Simplify() { - if (simple_) - return Incref(); - SimplifyWalker w; - return w.Walk(this, NULL); + CoalesceWalker cw; + Regexp* cre = cw.Walk(this, NULL); + if (cre == NULL) + return cre; + SimplifyWalker sw; + Regexp* sre = sw.Walk(cre, NULL); + cre->Decref(); + return sre; } #define Simplify DontCallSimplify // Avoid accidental recursion +// Utility function for PostVisit implementations that compares re->sub() with +// child_args to determine whether any child_args changed. In the common case, +// where nothing changed, calls Decref() for all child_args and returns false, +// so PostVisit must return re->Incref(). Otherwise, returns true. 
+static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { + for (int i = 0; i < re->nsub(); i++) { + Regexp* sub = re->sub()[i]; + Regexp* newsub = child_args[i]; + if (newsub != sub) + return true; + } + for (int i = 0; i < re->nsub(); i++) { + Regexp* newsub = child_args[i]; + newsub->Decref(); + } + return false; +} + +Regexp* CoalesceWalker::Copy(Regexp* re) { + return re->Incref(); +} + +Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { + // This should never be called, since we use Walk and not + // WalkExponential. + LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; + return re->Incref(); +} + +Regexp* CoalesceWalker::PostVisit(Regexp* re, + Regexp* parent_arg, + Regexp* pre_arg, + Regexp** child_args, + int nchild_args) { + if (re->nsub() == 0) + return re->Incref(); + + if (re->op() != kRegexpConcat) { + if (!ChildArgsChanged(re, child_args)) + return re->Incref(); + + // Something changed. Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub()); + Regexp** nre_subs = nre->sub(); + for (int i = 0; i < re->nsub(); i++) + nre_subs[i] = child_args[i]; + // Repeats and Captures have additional data that must be copied. + if (re->op() == kRegexpRepeat) { + nre->min_ = re->min(); + nre->max_ = re->max(); + } else if (re->op() == kRegexpCapture) { + nre->cap_ = re->cap(); + } + return nre; + } + + bool can_coalesce = false; + for (int i = 0; i < re->nsub(); i++) { + if (i+1 < re->nsub() && + CanCoalesce(child_args[i], child_args[i+1])) { + can_coalesce = true; + break; + } + } + if (!can_coalesce) { + if (!ChildArgsChanged(re, child_args)) + return re->Incref(); + + // Something changed. Build a new op. 
+ Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub()); + Regexp** nre_subs = nre->sub(); + for (int i = 0; i < re->nsub(); i++) + nre_subs[i] = child_args[i]; + return nre; + } + + for (int i = 0; i < re->nsub(); i++) { + if (i+1 < re->nsub() && + CanCoalesce(child_args[i], child_args[i+1])) + DoCoalesce(&child_args[i], &child_args[i+1]); + } + // Determine how many empty matches were left by DoCoalesce. + int n = 0; + for (int i = n; i < re->nsub(); i++) { + if (child_args[i]->op() == kRegexpEmptyMatch) + n++; + } + // Build a new op. + Regexp* nre = new Regexp(re->op(), re->parse_flags()); + nre->AllocSub(re->nsub() - n); + Regexp** nre_subs = nre->sub(); + for (int i = 0, j = 0; i < re->nsub(); i++) { + if (child_args[i]->op() == kRegexpEmptyMatch) { + child_args[i]->Decref(); + continue; + } + nre_subs[j] = child_args[i]; + j++; + } + return nre; +} + +bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { + // r1 must be a star/plus/quest/repeat of a literal, char class, any char or + // any byte. + if ((r1->op() == kRegexpStar || + r1->op() == kRegexpPlus || + r1->op() == kRegexpQuest || + r1->op() == kRegexpRepeat) && + (r1->sub()[0]->op() == kRegexpLiteral || + r1->sub()[0]->op() == kRegexpCharClass || + r1->sub()[0]->op() == kRegexpAnyChar || + r1->sub()[0]->op() == kRegexpAnyByte)) { + // r2 must be a star/plus/quest/repeat of the same literal, char class, + // any char or any byte. + if ((r2->op() == kRegexpStar || + r2->op() == kRegexpPlus || + r2->op() == kRegexpQuest || + r2->op() == kRegexpRepeat) && + Regexp::Equal(r1->sub()[0], r2->sub()[0]) && + // The parse flags must be consistent. + ((r1->parse_flags() & Regexp::NonGreedy) == + (r2->parse_flags() & Regexp::NonGreedy))) { + return true; + } + // ... OR an occurrence of that literal, char class, any char or any byte + if (Regexp::Equal(r1->sub()[0], r2)) { + return true; + } + // ... OR a literal string that begins with that literal. 
+ if (r1->sub()[0]->op() == kRegexpLiteral && + r2->op() == kRegexpLiteralString && + r2->runes()[0] == r1->sub()[0]->rune() && + // The parse flags must be consistent. + ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == + (r2->parse_flags() & Regexp::FoldCase))) { + return true; + } + } + return false; +} + +void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { + Regexp* r1 = *r1ptr; + Regexp* r2 = *r2ptr; + + Regexp* nre = Regexp::Repeat( + r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); + + switch (r1->op()) { + case kRegexpStar: + nre->min_ = 0; + nre->max_ = -1; + break; + + case kRegexpPlus: + nre->min_ = 1; + nre->max_ = -1; + break; + + case kRegexpQuest: + nre->min_ = 0; + nre->max_ = 1; + break; + + case kRegexpRepeat: + nre->min_ = r1->min(); + nre->max_ = r1->max(); + break; + + default: + LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); + nre->Decref(); + return; + } + + switch (r2->op()) { + case kRegexpStar: + nre->max_ = -1; + goto LeaveEmpty; + + case kRegexpPlus: + nre->min_++; + nre->max_ = -1; + goto LeaveEmpty; + + case kRegexpQuest: + if (nre->max() != -1) + nre->max_++; + goto LeaveEmpty; + + case kRegexpRepeat: + nre->min_ += r2->min(); + if (r2->max() == -1) + nre->max_ = -1; + else if (nre->max() != -1) + nre->max_ += r2->max(); + goto LeaveEmpty; + + case kRegexpLiteral: + case kRegexpCharClass: + case kRegexpAnyChar: + case kRegexpAnyByte: + nre->min_++; + if (nre->max() != -1) + nre->max_++; + goto LeaveEmpty; + + LeaveEmpty: + *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); + *r2ptr = nre; + break; + + case kRegexpLiteralString: { + Rune r = r1->sub()[0]->rune(); + // Determine how much of the literal string is removed. + // We know that we have at least one rune. 
:) + int n = 1; + while (n < r2->nrunes() && r2->runes()[n] == r) + n++; + nre->min_ += n; + if (nre->max() != -1) + nre->max_ += n; + if (n == r2->nrunes()) + goto LeaveEmpty; + *r1ptr = nre; + *r2ptr = Regexp::LiteralString( + &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); + break; + } + + default: + LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); + nre->Decref(); + return; + } + + r1->Decref(); + r2->Decref(); +} + Regexp* SimplifyWalker::Copy(Regexp* re) { return re->Incref(); } @@ -163,7 +444,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { } Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { - if (re->simple_) { + if (re->simple()) { *stop = true; return re->Incref(); } @@ -196,29 +477,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re, case kRegexpConcat: case kRegexpAlternate: { // These are simple as long as the subpieces are simple. - // Two passes to avoid allocation in the common case. - bool changed = false; - Regexp** subs = re->sub(); - for (int i = 0; i < re->nsub_; i++) { - Regexp* sub = subs[i]; - Regexp* newsub = child_args[i]; - if (newsub != sub) { - changed = true; - break; - } - } - if (!changed) { - for (int i = 0; i < re->nsub_; i++) { - Regexp* newsub = child_args[i]; - newsub->Decref(); - } + if (!ChildArgsChanged(re, child_args)) { re->simple_ = true; return re->Incref(); } Regexp* nre = new Regexp(re->op(), re->parse_flags()); - nre->AllocSub(re->nsub_); + nre->AllocSub(re->nsub()); Regexp** nre_subs = nre->sub(); - for (int i = 0; i nsub_; i++) + for (int i = 0; i < re->nsub(); i++) nre_subs[i] = child_args[i]; nre->simple_ = true; return nre; @@ -234,7 +500,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re, Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); nre->AllocSub(1); nre->sub()[0] = newsub; - nre->cap_ = re->cap_; + nre->cap_ = re->cap(); nre->simple_ = true; return nre; } @@ -323,13 +589,12 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* 
re, int min, int max, return Regexp::Plus(re->Incref(), f); // General case: x{4,} is xxxx+ - Regexp* nre = new Regexp(kRegexpConcat, f); - nre->AllocSub(min); - VLOG(1) << "Simplify " << min; - Regexp** nre_subs = nre->sub(); + Regexp** nre_subs = new Regexp*[min]; for (int i = 0; i < min-1; i++) nre_subs[i] = re->Incref(); nre_subs[min-1] = Regexp::Plus(re->Incref(), f); + Regexp* nre = Regexp::Concat(nre_subs, min, f); + delete[] nre_subs; return nre; } @@ -348,11 +613,11 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, // Build leading prefix: xx. Capturing only on the last one. Regexp* nre = NULL; if (min > 0) { - nre = new Regexp(kRegexpConcat, f); - nre->AllocSub(min); - Regexp** nre_subs = nre->sub(); + Regexp** nre_subs = new Regexp*[min]; for (int i = 0; i < min; i++) nre_subs[i] = re->Incref(); + nre = Regexp::Concat(nre_subs, min, f); + delete[] nre_subs; } // Build and attach suffix: (x(x(x)?)?)? diff --git a/contrib/libre2/re2/stringpiece.cc b/contrib/libre2/re2/stringpiece.cc new file mode 100644 index 00000000000..ef2e2874ead --- /dev/null +++ b/contrib/libre2/re2/stringpiece.cc @@ -0,0 +1,65 @@ +// Copyright 2004 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include "re2/stringpiece.h" + +#include + +#include "util/util.h" + +namespace re2 { + +const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h + +StringPiece::size_type StringPiece::copy(char* buf, size_type n, + size_type pos) const { + size_type ret = std::min(size_ - pos, n); + memcpy(buf, data_ + pos, ret); + return ret; +} + +StringPiece StringPiece::substr(size_type pos, size_type n) const { + if (pos > size_) pos = size_; + if (n > size_ - pos) n = size_ - pos; + return StringPiece(data_ + pos, n); +} + +StringPiece::size_type StringPiece::find(const StringPiece& s, + size_type pos) const { + if (pos > size_) return npos; + const_pointer result = std::search(data_ + pos, data_ + size_, + s.data_, s.data_ + s.size_); + size_type xpos = result - data_; + return xpos + s.size_ <= size_ ? xpos : npos; +} + +StringPiece::size_type StringPiece::find(char c, size_type pos) const { + if (size_ <= 0 || pos >= size_) return npos; + const_pointer result = std::find(data_ + pos, data_ + size_, c); + return result != data_ + size_ ? result - data_ : npos; +} + +StringPiece::size_type StringPiece::rfind(const StringPiece& s, + size_type pos) const { + if (size_ < s.size_) return npos; + if (s.size_ == 0) return std::min(size_, pos); + const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_; + const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_); + return result != last ? 
result - data_ : npos; +} + +StringPiece::size_type StringPiece::rfind(char c, size_type pos) const { + if (size_ <= 0) return npos; + for (size_t i = std::min(pos + 1, size_); i != 0;) { + if (data_[--i] == c) return i; + } + return npos; +} + +std::ostream& operator<<(std::ostream& o, const StringPiece& p) { + o.write(p.data(), p.size()); + return o; +} + +} // namespace re2 diff --git a/contrib/libre2/re2/stringpiece.h b/contrib/libre2/re2/stringpiece.h index ab9297c6d93..133cc827243 100644 --- a/contrib/libre2/re2/stringpiece.h +++ b/contrib/libre2/re2/stringpiece.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_STRINGPIECE_H_ +#define RE2_STRINGPIECE_H_ + // A string-like object that points to a sized piece of memory. // // Functions or methods may use const StringPiece& parameters to accept either @@ -16,140 +19,145 @@ // // Arghh! I wish C++ literals were "string". -#ifndef STRINGS_STRINGPIECE_H__ -#define STRINGS_STRINGPIECE_H__ - +#include #include -#include +#include #include +#include #include namespace re2 { class StringPiece { - private: - const char* ptr_; - int length_; - public: + typedef char value_type; + typedef char* pointer; + typedef const char* const_pointer; + typedef char& reference; + typedef const char& const_reference; + typedef const char* const_iterator; + typedef const_iterator iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef const_reverse_iterator reverse_iterator; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + static const size_type npos = static_cast(-1); + // We provide non-explicit singleton constructors so users can pass // in a "const char*" or a "string" wherever a "StringPiece" is // expected. - StringPiece() : ptr_(NULL), length_(0) { } - StringPiece(const char* str) - : ptr_(str), length_((str == NULL) ? 
0 : static_cast(strlen(str))) { } + StringPiece() + : data_(NULL), size_(0) {} StringPiece(const std::string& str) - : ptr_(str.data()), length_(static_cast(str.size())) { } - StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { } + : data_(str.data()), size_(str.size()) {} + StringPiece(const char* str) + : data_(str), size_(str == NULL ? 0 : strlen(str)) {} + StringPiece(const char* str, size_type len) + : data_(str), size_(len) {} - // data() may return a pointer to a buffer with embedded NULs, and the - // returned buffer may or may not be null terminated. Therefore it is - // typically a mistake to pass data() to a routine that expects a NUL - // terminated string. - const char* data() const { return ptr_; } - int size() const { return length_; } - int length() const { return length_; } - bool empty() const { return length_ == 0; } + const_iterator begin() const { return data_; } + const_iterator end() const { return data_ + size_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(data_ + size_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(data_); + } + + size_type size() const { return size_; } + size_type length() const { return size_; } + bool empty() const { return size_ == 0; } + + const_reference operator[](size_type i) const { return data_[i]; } + const_pointer data() const { return data_; } + + void remove_prefix(size_type n) { + data_ += n; + size_ -= n; + } + + void remove_suffix(size_type n) { + size_ -= n; + } - void clear() { ptr_ = NULL; length_ = 0; } - void set(const char* data, int len) { ptr_ = data; length_ = len; } void set(const char* str) { - ptr_ = str; - if (str != NULL) - length_ = static_cast(strlen(str)); - else - length_ = 0; - } - void set(const void* data, int len) { - ptr_ = reinterpret_cast(data); - length_ = len; + data_ = str; + size_ = str == NULL ? 
0 : strlen(str); } - char operator[](int i) const { return ptr_[i]; } - - void remove_prefix(int n) { - ptr_ += n; - length_ -= n; - } - - void remove_suffix(int n) { - length_ -= n; - } - - int compare(const StringPiece& x) const { - int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_)); - if (r == 0) { - if (length_ < x.length_) r = -1; - else if (length_ > x.length_) r = +1; - } - return r; + void set(const char* str, size_type len) { + data_ = str; + size_ = len; } std::string as_string() const { - return std::string(data(), size()); + return std::string(data_, size_); } + // We also define ToString() here, since many other string-like // interfaces name the routine that converts to a C++ string // "ToString", and it's confusing to have the method that does that // for a StringPiece be called "as_string()". We also leave the // "as_string()" method defined here for existing code. std::string ToString() const { - return std::string(data(), size()); + return std::string(data_, size_); } - void CopyToString(std::string* target) const; - void AppendToString(std::string* target) const; + void CopyToString(std::string* target) const { + target->assign(data_, size_); + } - // Does "this" start with "x" + void AppendToString(std::string* target) const { + target->append(data_, size_); + } + + size_type copy(char* buf, size_type n, size_type pos = 0) const; + StringPiece substr(size_type pos = 0, size_type n = npos) const; + + int compare(const StringPiece& x) const { + size_type min_size = std::min(size(), x.size()); + if (min_size > 0) { + int r = memcmp(data(), x.data(), min_size); + if (r < 0) return -1; + if (r > 0) return 1; + } + if (size() < x.size()) return -1; + if (size() > x.size()) return 1; + return 0; + } + + // Does "this" start with "x"? 
bool starts_with(const StringPiece& x) const { - return ((length_ >= x.length_) && - (memcmp(ptr_, x.ptr_, x.length_) == 0)); + return x.empty() || + (size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0); } - // Does "this" end with "x" + // Does "this" end with "x"? bool ends_with(const StringPiece& x) const { - return ((length_ >= x.length_) && - (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + return x.empty() || + (size() >= x.size() && + memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0); } - // standard STL container boilerplate - typedef char value_type; - typedef const char* pointer; - typedef const char& reference; - typedef const char& const_reference; - typedef size_t size_type; - typedef ptrdiff_t difference_type; - static const size_type npos; - typedef const char* const_iterator; - typedef const char* iterator; - typedef std::reverse_iterator const_reverse_iterator; - typedef std::reverse_iterator reverse_iterator; - iterator begin() const { return ptr_; } - iterator end() const { return ptr_ + length_; } - const_reverse_iterator rbegin() const { - return const_reverse_iterator(ptr_ + length_); + bool contains(const StringPiece& s) const { + return find(s) != npos; } - const_reverse_iterator rend() const { - return const_reverse_iterator(ptr_); - } - // STLS says return size_type, but Google says return int - int max_size() const { return length_; } - int capacity() const { return length_; } - int copy(char* buf, size_type n, size_type pos = 0) const; + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; - int find(const StringPiece& s, size_type pos = 0) const; - int find(char c, size_type pos = 0) const; - int rfind(const StringPiece& s, size_type pos = npos) const; - int rfind(char c, size_type pos = npos) const; - - StringPiece 
substr(size_type pos, size_type n = npos) const; - - static bool _equal(const StringPiece&, const StringPiece&); + private: + const_pointer data_; + size_type size_; }; inline bool operator==(const StringPiece& x, const StringPiece& y) { - return StringPiece::_equal(x, y); + StringPiece::size_type len = x.size(); + if (len != y.size()) return false; + return x.data() == y.data() || len == 0 || + memcmp(x.data(), y.data(), len) == 0; } inline bool operator!=(const StringPiece& x, const StringPiece& y) { @@ -157,9 +165,9 @@ inline bool operator!=(const StringPiece& x, const StringPiece& y) { } inline bool operator<(const StringPiece& x, const StringPiece& y) { - const int r = memcmp(x.data(), y.data(), - std::min(x.size(), y.size())); - return ((r < 0) || ((r == 0) && (x.size() < y.size()))); + StringPiece::size_type min_size = std::min(x.size(), y.size()); + int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size); + return (r < 0) || (r == 0 && x.size() < y.size()); } inline bool operator>(const StringPiece& x, const StringPiece& y) { @@ -174,9 +182,9 @@ inline bool operator>=(const StringPiece& x, const StringPiece& y) { return !(x < y); } +// Allow StringPiece to be logged. +std::ostream& operator<<(std::ostream& o, const StringPiece& p); + } // namespace re2 -// allow StringPiece to be logged -extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece); - -#endif // STRINGS_STRINGPIECE_H__ +#endif // RE2_STRINGPIECE_H_ diff --git a/contrib/libre2/re2/tostring.cc b/contrib/libre2/re2/tostring.cc index 555524f291b..278c310136e 100644 --- a/contrib/libre2/re2/tostring.cc +++ b/contrib/libre2/re2/tostring.cc @@ -5,7 +5,13 @@ // Format a regular expression structure as a string. 
// Tested by parse_test.cc +#include +#include + #include "util/util.h" +#include "util/logging.h" +#include "util/strutil.h" +#include "util/utf.h" #include "re2/regexp.h" #include "re2/walker-inl.h" @@ -42,7 +48,8 @@ class ToStringWalker : public Regexp::Walker { private: string* t_; // The string the walker appends to. - DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker); + ToStringWalker(const ToStringWalker&) = delete; + ToStringWalker& operator=(const ToStringWalker&) = delete; }; string Regexp::ToString() { @@ -94,6 +101,8 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { case kRegexpCapture: t_->append("("); + if (re->cap() == 0) + LOG(DFATAL) << "kRegexpCapture cap() == 0"; if (re->name()) { t_->append("?P<"); t_->append(*re->name()); @@ -120,13 +129,12 @@ int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { static void AppendLiteral(string *t, Rune r, bool foldcase) { if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { t->append(1, '\\'); - t->append(1, r); + t->append(1, static_cast(r)); } else if (foldcase && 'a' <= r && r <= 'z') { - if ('a' <= r && r <= 'z') - r += 'A' - 'a'; + r -= 'a' - 'A'; t->append(1, '['); - t->append(1, r); - t->append(1, r + 'a' - 'A'); + t->append(1, static_cast(r)); + t->append(1, static_cast(r) + 'a' - 'A'); t->append(1, ']'); } else { AppendCCRange(t, r, r); @@ -154,12 +162,14 @@ int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, break; case kRegexpLiteral: - AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase); + AppendLiteral(t_, re->rune(), + (re->parse_flags() & Regexp::FoldCase) != 0); break; case kRegexpLiteralString: for (int i = 0; i < re->nrunes(); i++) - AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase); + AppendLiteral(t_, re->runes()[i], + (re->parse_flags() & Regexp::FoldCase) != 0); if (prec < PrecConcat) t_->append(")"); break; @@ -297,7 +307,7 @@ static void AppendCCChar(string* t, Rune r) { if (0x20 <= r && r 
<= 0x7E) { if (strchr("[]^-\\", r)) t->append("\\"); - t->append(1, r); + t->append(1, static_cast(r)); return; } switch (r) { diff --git a/contrib/libre2/re2/unicode.py b/contrib/libre2/re2/unicode.py index 6dfe87bbcef..2899c87ae80 100644 --- a/contrib/libre2/re2/unicode.py +++ b/contrib/libre2/re2/unicode.py @@ -9,7 +9,7 @@ import re import urllib2 # Directory or URL where Unicode tables reside. -_UNICODE_DIR = "http://www.unicode.org/Public/6.3.0/ucd" +_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd" # Largest valid Unicode code value. _RUNE_MAX = 0x10FFFF diff --git a/contrib/libre2/re2/unicode_casefold.cc b/contrib/libre2/re2/unicode_casefold.cc index 2293cc75c18..1686943e59c 100644 --- a/contrib/libre2/re2/unicode_casefold.cc +++ b/contrib/libre2/re2/unicode_casefold.cc @@ -7,7 +7,7 @@ namespace re2 { -// 1034 groups, 2089 pairs, 289 ranges +// 1295 groups, 2620 pairs, 343 ranges const CaseFold unicode_casefold[] = { { 65, 90, 32 }, { 97, 106, -32 }, @@ -105,13 +105,17 @@ const CaseFold unicode_casefold[] = { { 598, 599, -205 }, { 601, 601, -202 }, { 603, 603, -203 }, + { 604, 604, 42319 }, { 608, 608, -205 }, + { 609, 609, 42315 }, { 611, 611, -207 }, { 613, 613, 42280 }, { 614, 614, 42308 }, { 616, 616, -209 }, { 617, 617, -211 }, + { 618, 618, 42308 }, { 619, 619, 10743 }, + { 620, 620, 42305 }, { 623, 623, -211 }, { 625, 625, 10749 }, { 626, 626, -213 }, @@ -119,15 +123,19 @@ const CaseFold unicode_casefold[] = { { 637, 637, 10727 }, { 640, 640, -218 }, { 643, 643, -218 }, + { 647, 647, 42282 }, { 648, 648, -218 }, { 649, 649, -69 }, { 650, 651, -217 }, { 652, 652, -71 }, { 658, 658, -219 }, + { 669, 669, 42261 }, + { 670, 670, 42258 }, { 837, 837, 84 }, { 880, 883, EvenOdd }, { 886, 887, EvenOdd }, { 891, 893, 130 }, + { 895, 895, 116 }, { 902, 902, 38 }, { 904, 906, 37 }, { 908, 908, 64 }, @@ -168,6 +176,7 @@ const CaseFold unicode_casefold[] = { { 1008, 1008, -86 }, { 1009, 1009, -80 }, { 1010, 1010, 7 }, + { 1011, 1011, -116 }, { 1012, 1012, 
-92 }, { 1013, 1013, -96 }, { 1015, 1016, OddEven }, @@ -176,19 +185,43 @@ const CaseFold unicode_casefold[] = { { 1021, 1023, -130 }, { 1024, 1039, 80 }, { 1040, 1071, 32 }, - { 1072, 1103, -32 }, + { 1072, 1073, -32 }, + { 1074, 1074, 6222 }, + { 1075, 1075, -32 }, + { 1076, 1076, 6221 }, + { 1077, 1085, -32 }, + { 1086, 1086, 6212 }, + { 1087, 1088, -32 }, + { 1089, 1090, 6210 }, + { 1091, 1097, -32 }, + { 1098, 1098, 6204 }, + { 1099, 1103, -32 }, { 1104, 1119, -80 }, - { 1120, 1153, EvenOdd }, + { 1120, 1122, EvenOdd }, + { 1123, 1123, 6180 }, + { 1124, 1153, EvenOdd }, { 1162, 1215, EvenOdd }, { 1216, 1216, 15 }, { 1217, 1230, OddEven }, { 1231, 1231, -15 }, - { 1232, 1319, EvenOdd }, + { 1232, 1327, EvenOdd }, { 1329, 1366, 48 }, { 1377, 1414, -48 }, { 4256, 4293, 7264 }, { 4295, 4295, 7264 }, { 4301, 4301, 7264 }, + { 5024, 5103, 38864 }, + { 5104, 5109, 8 }, + { 5112, 5117, -8 }, + { 7296, 7296, -6254 }, + { 7297, 7297, -6253 }, + { 7298, 7298, -6244 }, + { 7299, 7299, -6242 }, + { 7300, 7300, EvenOdd }, + { 7301, 7301, -6243 }, + { 7302, 7302, -6236 }, + { 7303, 7303, -6181 }, + { 7304, 7304, 35266 }, { 7545, 7545, 35332 }, { 7549, 7549, 3814 }, { 7680, 7776, EvenOdd }, @@ -282,8 +315,10 @@ const CaseFold unicode_casefold[] = { { 11520, 11557, -7264 }, { 11559, 11559, -7264 }, { 11565, 11565, -7264 }, - { 42560, 42605, EvenOdd }, - { 42624, 42647, EvenOdd }, + { 42560, 42570, EvenOdd }, + { 42571, 42571, -35267 }, + { 42572, 42605, EvenOdd }, + { 42624, 42651, EvenOdd }, { 42786, 42799, EvenOdd }, { 42802, 42863, EvenOdd }, { 42873, 42876, OddEven }, @@ -292,16 +327,35 @@ const CaseFold unicode_casefold[] = { { 42891, 42892, OddEven }, { 42893, 42893, -42280 }, { 42896, 42899, EvenOdd }, - { 42912, 42921, EvenOdd }, + { 42902, 42921, EvenOdd }, { 42922, 42922, -42308 }, + { 42923, 42923, -42319 }, + { 42924, 42924, -42315 }, + { 42925, 42925, -42305 }, + { 42926, 42926, -42308 }, + { 42928, 42928, -42258 }, + { 42929, 42929, -42282 }, + { 42930, 42930, 
-42261 }, + { 42931, 42931, 928 }, + { 42932, 42935, EvenOdd }, + { 43859, 43859, -928 }, + { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 65345, 65370, -32 }, { 66560, 66599, 40 }, { 66600, 66639, -40 }, + { 66736, 66771, 40 }, + { 66776, 66811, -40 }, + { 68736, 68786, 64 }, + { 68800, 68850, -64 }, + { 71840, 71871, 32 }, + { 71872, 71903, -32 }, + { 125184, 125217, 34 }, + { 125218, 125251, -34 }, }; -const int num_unicode_casefold = 289; +const int num_unicode_casefold = 343; -// 1034 groups, 1055 pairs, 167 ranges +// 1295 groups, 1325 pairs, 191 ranges const CaseFold unicode_tolower[] = { { 65, 90, 32 }, { 181, 181, 775 }, @@ -370,6 +424,7 @@ const CaseFold unicode_tolower[] = { { 837, 837, 116 }, { 880, 882, EvenOddSkip }, { 886, 886, EvenOdd }, + { 895, 895, 116 }, { 902, 902, 38 }, { 904, 906, 37 }, { 908, 908, 64 }, @@ -397,11 +452,20 @@ const CaseFold unicode_tolower[] = { { 1162, 1214, EvenOddSkip }, { 1216, 1216, 15 }, { 1217, 1229, OddEvenSkip }, - { 1232, 1318, EvenOddSkip }, + { 1232, 1326, EvenOddSkip }, { 1329, 1366, 48 }, { 4256, 4293, 7264 }, { 4295, 4295, 7264 }, { 4301, 4301, 7264 }, + { 5112, 5117, -8 }, + { 7296, 7296, -6222 }, + { 7297, 7297, -6221 }, + { 7298, 7298, -6212 }, + { 7299, 7300, -6210 }, + { 7301, 7301, -6211 }, + { 7302, 7302, -6204 }, + { 7303, 7303, -6180 }, + { 7304, 7304, 35267 }, { 7680, 7828, EvenOddSkip }, { 7835, 7835, -58 }, { 7838, 7838, -7615 }, @@ -457,7 +521,7 @@ const CaseFold unicode_tolower[] = { { 11499, 11501, OddEvenSkip }, { 11506, 11506, EvenOdd }, { 42560, 42604, EvenOddSkip }, - { 42624, 42646, EvenOddSkip }, + { 42624, 42650, EvenOddSkip }, { 42786, 42798, EvenOddSkip }, { 42802, 42862, EvenOddSkip }, { 42873, 42875, OddEvenSkip }, @@ -466,12 +530,26 @@ const CaseFold unicode_tolower[] = { { 42891, 42891, OddEven }, { 42893, 42893, -42280 }, { 42896, 42898, EvenOddSkip }, - { 42912, 42920, EvenOddSkip }, + { 42902, 42920, EvenOddSkip }, { 42922, 42922, -42308 }, + { 42923, 42923, -42319 }, + { 
42924, 42924, -42315 }, + { 42925, 42925, -42305 }, + { 42926, 42926, -42308 }, + { 42928, 42928, -42258 }, + { 42929, 42929, -42282 }, + { 42930, 42930, -42261 }, + { 42931, 42931, 928 }, + { 42932, 42934, EvenOddSkip }, + { 43888, 43967, -38864 }, { 65313, 65338, 32 }, { 66560, 66599, 40 }, + { 66736, 66771, 40 }, + { 68736, 68786, 64 }, + { 71840, 71871, 32 }, + { 125184, 125217, 34 }, }; -const int num_unicode_tolower = 167; +const int num_unicode_tolower = 191; diff --git a/contrib/libre2/re2/unicode_casefold.h b/contrib/libre2/re2/unicode_casefold.h index 7f438aabf7c..8bdbb42fbc1 100644 --- a/contrib/libre2/re2/unicode_casefold.h +++ b/contrib/libre2/re2/unicode_casefold.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_UNICODE_CASEFOLD_H_ +#define RE2_UNICODE_CASEFOLD_H_ + // Unicode case folding tables. // The Unicode case folding tables encode the mapping from one Unicode point @@ -16,7 +19,7 @@ // 'K' -> 'K' // // Like everything Unicode, these tables are big. If we represent the table -// as a sorted list of uint32 pairs, it has 2049 entries and is 16 kB. +// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB. // Most table entries look like the ones around them: // 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc. // Instead of listing all the pairs explicitly, we make a list of ranges @@ -36,10 +39,10 @@ // The grouped form also allows for efficient fold range calculations // rather than looping one character at a time. 
-#ifndef RE2_UNICODE_CASEFOLD_H__ -#define RE2_UNICODE_CASEFOLD_H__ +#include #include "util/util.h" +#include "util/utf.h" namespace re2 { @@ -51,9 +54,9 @@ enum { }; struct CaseFold { - uint32 lo; - uint32 hi; - int32 delta; + Rune lo; + Rune hi; + int32_t delta; }; extern const CaseFold unicode_casefold[]; @@ -72,4 +75,4 @@ extern Rune ApplyFold(const CaseFold *f, Rune r); } // namespace re2 -#endif // RE2_UNICODE_CASEFOLD_H__ +#endif // RE2_UNICODE_CASEFOLD_H_ diff --git a/contrib/libre2/re2/unicode_groups.cc b/contrib/libre2/re2/unicode_groups.cc index 0df585e3542..ba104d2bd3f 100644 --- a/contrib/libre2/re2/unicode_groups.cc +++ b/contrib/libre2/re2/unicode_groups.cc @@ -53,6 +53,7 @@ static const URange16 Ps_range16[] = { { 11812, 11812 }, { 11814, 11814 }, { 11816, 11816 }, + { 11842, 11842 }, { 12296, 12296 }, { 12298, 12298 }, { 12300, 12300 }, @@ -63,7 +64,7 @@ static const URange16 Ps_range16[] = { { 12312, 12312 }, { 12314, 12314 }, { 12317, 12317 }, - { 64830, 64830 }, + { 64831, 64831 }, { 65047, 65047 }, { 65077, 65077 }, { 65079, 65079 }, @@ -97,7 +98,7 @@ static const URange32 Nl_range32[] = { { 66369, 66369 }, { 66378, 66378 }, { 66513, 66517 }, - { 74752, 74850 }, + { 74752, 74862 }, }; static const URange16 No_range16[] = { { 178, 179 }, @@ -107,7 +108,8 @@ static const URange16 No_range16[] = { { 2930, 2935 }, { 3056, 3058 }, { 3192, 3198 }, - { 3440, 3445 }, + { 3416, 3422 }, + { 3440, 3448 }, { 3882, 3891 }, { 4969, 4988 }, { 6128, 6137 }, @@ -132,18 +134,35 @@ static const URange16 No_range16[] = { static const URange32 No_range32[] = { { 65799, 65843 }, { 65909, 65912 }, - { 65930, 65930 }, + { 65930, 65931 }, + { 66273, 66299 }, { 66336, 66339 }, { 67672, 67679 }, + { 67705, 67711 }, + { 67751, 67759 }, + { 67835, 67839 }, { 67862, 67867 }, + { 68028, 68029 }, + { 68032, 68047 }, + { 68050, 68095 }, { 68160, 68167 }, { 68221, 68222 }, + { 68253, 68255 }, + { 68331, 68335 }, { 68440, 68447 }, { 68472, 68479 }, + { 68521, 68527 }, + { 
68858, 68863 }, { 69216, 69246 }, { 69714, 69733 }, + { 70113, 70132 }, + { 71482, 71483 }, + { 71914, 71922 }, + { 72794, 72812 }, + { 93019, 93025 }, { 119648, 119665 }, - { 127232, 127242 }, + { 125127, 125135 }, + { 127232, 127244 }, }; static const URange16 Lo_range16[] = { { 170, 170 }, @@ -168,14 +187,14 @@ static const URange16 Lo_range16[] = { { 1994, 2026 }, { 2048, 2069 }, { 2112, 2136 }, - { 2208, 2208 }, - { 2210, 2220 }, + { 2144, 2154 }, + { 2208, 2228 }, + { 2230, 2237 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, { 2392, 2401 }, - { 2418, 2423 }, - { 2425, 2431 }, + { 2418, 2432 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -187,6 +206,7 @@ static const URange16 Lo_range16[] = { { 2524, 2525 }, { 2527, 2529 }, { 2544, 2545 }, + { 2556, 2556 }, { 2565, 2570 }, { 2575, 2576 }, { 2579, 2600 }, @@ -206,6 +226,7 @@ static const URange16 Lo_range16[] = { { 2749, 2749 }, { 2768, 2768 }, { 2784, 2785 }, + { 2809, 2809 }, { 2821, 2828 }, { 2831, 2832 }, { 2835, 2856 }, @@ -230,11 +251,11 @@ static const URange16 Lo_range16[] = { { 3077, 3084 }, { 3086, 3088 }, { 3090, 3112 }, - { 3114, 3123 }, - { 3125, 3129 }, + { 3114, 3129 }, { 3133, 3133 }, - { 3160, 3161 }, + { 3160, 3162 }, { 3168, 3169 }, + { 3200, 3200 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -249,7 +270,8 @@ static const URange16 Lo_range16[] = { { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, - { 3424, 3425 }, + { 3412, 3414 }, + { 3423, 3425 }, { 3450, 3455 }, { 3461, 3478 }, { 3482, 3505 }, @@ -306,11 +328,11 @@ static const URange16 Lo_range16[] = { { 4882, 4885 }, { 4888, 4954 }, { 4992, 5007 }, - { 5024, 5108 }, { 5121, 5740 }, { 5743, 5759 }, { 5761, 5786 }, { 5792, 5866 }, + { 5873, 5880 }, { 5888, 5900 }, { 5902, 5905 }, { 5920, 5937 }, @@ -321,14 +343,15 @@ static const URange16 Lo_range16[] = { { 6108, 6108 }, { 6176, 6210 }, { 6212, 6263 }, - { 6272, 6312 }, + { 6272, 6276 }, + { 6279, 6312 }, { 6314, 6314 }, { 6320, 6389 }, - { 6400, 6428 }, + { 6400, 6430 }, 
{ 6480, 6509 }, { 6512, 6516 }, { 6528, 6571 }, - { 6593, 6599 }, + { 6576, 6601 }, { 6656, 6678 }, { 6688, 6740 }, { 6917, 6963 }, @@ -359,12 +382,12 @@ static const URange16 Lo_range16[] = { { 12447, 12447 }, { 12449, 12538 }, { 12543, 12543 }, - { 12549, 12589 }, + { 12549, 12590 }, { 12593, 12686 }, { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40908 }, + { 19968, 40938 }, { 40960, 40980 }, { 40982, 42124 }, { 42192, 42231 }, @@ -373,6 +396,8 @@ static const URange16 Lo_range16[] = { { 42538, 42539 }, { 42606, 42606 }, { 42656, 42725 }, + { 42895, 42895 }, + { 42999, 42999 }, { 43003, 43009 }, { 43011, 43013 }, { 43015, 43018 }, @@ -381,17 +406,21 @@ static const URange16 Lo_range16[] = { { 43138, 43187 }, { 43250, 43255 }, { 43259, 43259 }, + { 43261, 43261 }, { 43274, 43301 }, { 43312, 43334 }, { 43360, 43388 }, { 43396, 43442 }, + { 43488, 43492 }, + { 43495, 43503 }, + { 43514, 43518 }, { 43520, 43560 }, { 43584, 43586 }, { 43588, 43595 }, { 43616, 43631 }, { 43633, 43638 }, { 43642, 43642 }, - { 43648, 43695 }, + { 43646, 43695 }, { 43697, 43697 }, { 43701, 43702 }, { 43705, 43709 }, @@ -443,19 +472,29 @@ static const URange32 Lo_range32[] = { { 65664, 65786 }, { 66176, 66204 }, { 66208, 66256 }, - { 66304, 66334 }, - { 66352, 66368 }, + { 66304, 66335 }, + { 66349, 66368 }, { 66370, 66377 }, + { 66384, 66421 }, { 66432, 66461 }, { 66464, 66499 }, { 66504, 66511 }, { 66640, 66717 }, + { 66816, 66855 }, + { 66864, 66915 }, + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, { 67639, 67640 }, { 67644, 67644 }, { 67647, 67669 }, + { 67680, 67702 }, + { 67712, 67742 }, + { 67808, 67826 }, + { 67828, 67829 }, { 67840, 67861 }, { 67872, 67897 }, { 67968, 68023 }, @@ -465,23 +504,89 @@ static const URange32 Lo_range32[] = { { 68117, 68119 }, { 68121, 68147 }, { 68192, 68220 }, + { 68224, 68252 }, + { 68288, 68295 }, + { 68297, 68324 }, { 68352, 68405 }, { 68416, 68437 }, { 68448, 
68466 }, + { 68480, 68497 }, { 68608, 68680 }, { 69635, 69687 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, + { 69968, 70002 }, + { 70006, 70006 }, { 70019, 70066 }, { 70081, 70084 }, + { 70106, 70106 }, + { 70108, 70108 }, + { 70144, 70161 }, + { 70163, 70187 }, + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70312 }, + { 70320, 70366 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70461, 70461 }, + { 70480, 70480 }, + { 70493, 70497 }, + { 70656, 70708 }, + { 70727, 70730 }, + { 70784, 70831 }, + { 70852, 70853 }, + { 70855, 70855 }, + { 71040, 71086 }, + { 71128, 71131 }, + { 71168, 71215 }, + { 71236, 71236 }, { 71296, 71338 }, - { 73728, 74606 }, + { 71424, 71449 }, + { 71935, 71935 }, + { 72192, 72192 }, + { 72203, 72242 }, + { 72250, 72250 }, + { 72272, 72272 }, + { 72284, 72323 }, + { 72326, 72329 }, + { 72384, 72440 }, + { 72704, 72712 }, + { 72714, 72750 }, + { 72768, 72768 }, + { 72818, 72847 }, + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73008 }, + { 73030, 73030 }, + { 73728, 74649 }, + { 74880, 75075 }, { 77824, 78894 }, + { 82944, 83526 }, { 92160, 92728 }, + { 92736, 92766 }, + { 92880, 92909 }, + { 92928, 92975 }, + { 93027, 93047 }, + { 93053, 93071 }, { 93952, 94020 }, { 94032, 94032 }, - { 110592, 110593 }, + { 94208, 100332 }, + { 100352, 101106 }, + { 110592, 110878 }, + { 110960, 111355 }, + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, + { 124928, 125124 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -518,6 +623,8 @@ static const URange32 Lo_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, + { 178208, 183969 }, + { 183984, 191456 }, { 194560, 195101 }, }; static const URange16 Ll_range16[] = { @@ -786,7 +893,13 @@ static const URange16 Ll_range16[] = { { 1315, 1315 }, { 1317, 1317 }, { 1319, 1319 }, + { 1321, 1321 }, + { 
1323, 1323 }, + { 1325, 1325 }, + { 1327, 1327 }, { 1377, 1415 }, + { 5112, 5117 }, + { 7296, 7304 }, { 7424, 7467 }, { 7531, 7543 }, { 7545, 7578 }, @@ -1044,6 +1157,8 @@ static const URange16 Ll_range16[] = { { 42643, 42643 }, { 42645, 42645 }, { 42647, 42647 }, + { 42649, 42649 }, + { 42651, 42651 }, { 42787, 42787 }, { 42789, 42789 }, { 42791, 42791 }, @@ -1093,19 +1208,32 @@ static const URange16 Ll_range16[] = { { 42892, 42892 }, { 42894, 42894 }, { 42897, 42897 }, - { 42899, 42899 }, + { 42899, 42901 }, + { 42903, 42903 }, + { 42905, 42905 }, + { 42907, 42907 }, + { 42909, 42909 }, + { 42911, 42911 }, { 42913, 42913 }, { 42915, 42915 }, { 42917, 42917 }, { 42919, 42919 }, { 42921, 42921 }, + { 42933, 42933 }, + { 42935, 42935 }, { 43002, 43002 }, + { 43824, 43866 }, + { 43872, 43877 }, + { 43888, 43967 }, { 64256, 64262 }, { 64275, 64279 }, { 65345, 65370 }, }; static const URange32 Ll_range32[] = { { 66600, 66639 }, + { 66776, 66811 }, + { 68800, 68850 }, + { 71872, 71903 }, { 119834, 119859 }, { 119886, 119892 }, { 119894, 119911 }, @@ -1134,6 +1262,7 @@ static const URange32 Ll_range32[] = { { 120746, 120770 }, { 120772, 120777 }, { 120779, 120779 }, + { 125218, 125251 }, }; static const URange16 Lm_range16[] = { { 688, 705 }, @@ -1177,19 +1306,24 @@ static const URange16 Lm_range16[] = { { 42232, 42237 }, { 42508, 42508 }, { 42623, 42623 }, + { 42652, 42653 }, { 42775, 42783 }, { 42864, 42864 }, { 42888, 42888 }, { 43000, 43001 }, { 43471, 43471 }, + { 43494, 43494 }, { 43632, 43632 }, { 43741, 43741 }, { 43763, 43764 }, + { 43868, 43871 }, { 65392, 65392 }, { 65438, 65439 }, }; static const URange32 Lm_range32[] = { + { 92992, 92995 }, { 94099, 94111 }, + { 94176, 94177 }, }; static const URange16 Nd_range16[] = { { 48, 57 }, @@ -1205,6 +1339,7 @@ static const URange16 Nd_range16[] = { { 3174, 3183 }, { 3302, 3311 }, { 3430, 3439 }, + { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, { 3872, 3881 }, @@ -1224,6 +1359,7 @@ static const URange16 Nd_range16[] 
= { { 43216, 43225 }, { 43264, 43273 }, { 43472, 43481 }, + { 43504, 43513 }, { 43600, 43609 }, { 44016, 44025 }, { 65296, 65305 }, @@ -1234,8 +1370,19 @@ static const URange32 Nd_range32[] = { { 69872, 69881 }, { 69942, 69951 }, { 70096, 70105 }, + { 70384, 70393 }, + { 70736, 70745 }, + { 70864, 70873 }, + { 71248, 71257 }, { 71360, 71369 }, + { 71472, 71481 }, + { 71904, 71913 }, + { 72784, 72793 }, + { 73040, 73049 }, + { 92768, 92777 }, + { 93008, 93017 }, { 120782, 120831 }, + { 125264, 125273 }, }; static const URange16 Pc_range16[] = { { 95, 95 }, @@ -1405,6 +1552,7 @@ static const URange16 Lu_range16[] = { { 880, 880 }, { 882, 882 }, { 886, 886 }, + { 895, 895 }, { 902, 902 }, { 904, 906 }, { 908, 908 }, @@ -1524,10 +1672,15 @@ static const URange16 Lu_range16[] = { { 1314, 1314 }, { 1316, 1316 }, { 1318, 1318 }, + { 1320, 1320 }, + { 1322, 1322 }, + { 1324, 1324 }, + { 1326, 1326 }, { 1329, 1366 }, { 4256, 4293 }, { 4295, 4295 }, { 4301, 4301 }, + { 5024, 5109 }, { 7680, 7680 }, { 7682, 7682 }, { 7684, 7684 }, @@ -1778,6 +1931,8 @@ static const URange16 Lu_range16[] = { { 42642, 42642 }, { 42644, 42644 }, { 42646, 42646 }, + { 42648, 42648 }, + { 42650, 42650 }, { 42786, 42786 }, { 42788, 42788 }, { 42790, 42790 }, @@ -1827,16 +1982,26 @@ static const URange16 Lu_range16[] = { { 42893, 42893 }, { 42896, 42896 }, { 42898, 42898 }, + { 42902, 42902 }, + { 42904, 42904 }, + { 42906, 42906 }, + { 42908, 42908 }, + { 42910, 42910 }, { 42912, 42912 }, { 42914, 42914 }, { 42916, 42916 }, { 42918, 42918 }, { 42920, 42920 }, - { 42922, 42922 }, + { 42922, 42926 }, + { 42928, 42932 }, + { 42934, 42934 }, { 65313, 65338 }, }; static const URange32 Lu_range32[] = { { 66560, 66599 }, + { 66736, 66771 }, + { 68736, 68786 }, + { 71840, 71871 }, { 119808, 119833 }, { 119860, 119885 }, { 119912, 119937 }, @@ -1868,6 +2033,7 @@ static const URange32 Lu_range32[] = { { 120662, 120686 }, { 120720, 120744 }, { 120778, 120778 }, + { 125184, 125217 }, }; static const URange16 
Pf_range16[] = { { 187, 187 }, @@ -1891,6 +2057,7 @@ static const URange16 Pd_range16[] = { { 11799, 11799 }, { 11802, 11802 }, { 11834, 11835 }, + { 11840, 11840 }, { 12316, 12316 }, { 12336, 12336 }, { 12448, 12448 }, @@ -1953,7 +2120,7 @@ static const URange16 Pe_range16[] = { { 12313, 12313 }, { 12315, 12315 }, { 12318, 12319 }, - { 64831, 64831 }, + { 64830, 64830 }, { 65048, 65048 }, { 65078, 65078 }, { 65080, 65080 }, @@ -2019,6 +2186,7 @@ static const URange16 Po_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, + { 2557, 2557 }, { 2800, 2800 }, { 3572, 3572 }, { 3663, 3663 }, @@ -2068,6 +2236,9 @@ static const URange16 Po_range16[] = { { 11806, 11807 }, { 11818, 11822 }, { 11824, 11833 }, + { 11836, 11839 }, + { 11841, 11841 }, + { 11843, 11849 }, { 12289, 12291 }, { 12349, 12349 }, { 12539, 12539 }, @@ -2079,6 +2250,7 @@ static const URange16 Po_range16[] = { { 43124, 43127 }, { 43214, 43215 }, { 43256, 43258 }, + { 43260, 43260 }, { 43310, 43311 }, { 43359, 43359 }, { 43457, 43469 }, @@ -2112,21 +2284,51 @@ static const URange32 Po_range32[] = { { 65792, 65794 }, { 66463, 66463 }, { 66512, 66512 }, + { 66927, 66927 }, { 67671, 67671 }, { 67871, 67871 }, { 67903, 67903 }, { 68176, 68184 }, { 68223, 68223 }, + { 68336, 68342 }, { 68409, 68415 }, + { 68505, 68508 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, { 69952, 69955 }, - { 70085, 70088 }, - { 74864, 74867 }, + { 70004, 70005 }, + { 70085, 70089 }, + { 70093, 70093 }, + { 70107, 70107 }, + { 70109, 70111 }, + { 70200, 70205 }, + { 70313, 70313 }, + { 70731, 70735 }, + { 70747, 70747 }, + { 70749, 70749 }, + { 70854, 70854 }, + { 71105, 71127 }, + { 71233, 71235 }, + { 71264, 71276 }, + { 71484, 71486 }, + { 72255, 72262 }, + { 72346, 72348 }, + { 72350, 72354 }, + { 72769, 72773 }, + { 72816, 72817 }, + { 74864, 74868 }, + { 92782, 92783 }, + { 92917, 92917 }, + { 92983, 92987 }, + { 92996, 92996 }, + { 113823, 113823 }, + { 121479, 121483 }, + { 125278, 125279 }, }; static 
const URange16 Me_range16[] = { { 1160, 1161 }, + { 6846, 6846 }, { 8413, 8416 }, { 8418, 8420 }, { 42608, 42610 }, @@ -2135,10 +2337,11 @@ static const URange16 C_range16[] = { { 0, 31 }, { 127, 159 }, { 173, 173 }, - { 1536, 1540 }, + { 1536, 1541 }, { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, { 8234, 8238 }, @@ -2150,6 +2353,7 @@ static const URange16 C_range16[] = { }; static const URange32 C_range32[] = { { 69821, 69821 }, + { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, { 917536, 917631 }, @@ -2221,8 +2425,6 @@ static const URange16 Mc_range16[] = { { 6441, 6443 }, { 6448, 6449 }, { 6451, 6456 }, - { 6576, 6592 }, - { 6600, 6601 }, { 6681, 6682 }, { 6741, 6741 }, { 6743, 6743 }, @@ -2238,7 +2440,6 @@ static const URange16 Mc_range16[] = { { 7073, 7073 }, { 7078, 7079 }, { 7082, 7082 }, - { 7084, 7085 }, { 7143, 7143 }, { 7146, 7148 }, { 7150, 7150 }, @@ -2247,6 +2448,7 @@ static const URange16 Mc_range16[] = { { 7220, 7221 }, { 7393, 7393 }, { 7410, 7411 }, + { 7415, 7415 }, { 12334, 12335 }, { 43043, 43044 }, { 43047, 43047 }, @@ -2261,6 +2463,7 @@ static const URange16 Mc_range16[] = { { 43571, 43572 }, { 43597, 43597 }, { 43643, 43643 }, + { 43645, 43645 }, { 43755, 43755 }, { 43758, 43759 }, { 43765, 43765 }, @@ -2279,9 +2482,44 @@ static const URange32 Mc_range32[] = { { 70018, 70018 }, { 70067, 70069 }, { 70079, 70080 }, + { 70188, 70190 }, + { 70194, 70195 }, + { 70197, 70197 }, + { 70368, 70370 }, + { 70402, 70403 }, + { 70462, 70463 }, + { 70465, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70487, 70487 }, + { 70498, 70499 }, + { 70709, 70711 }, + { 70720, 70721 }, + { 70725, 70725 }, + { 70832, 70834 }, + { 70841, 70841 }, + { 70843, 70846 }, + { 70849, 70849 }, + { 71087, 71089 }, + { 71096, 71099 }, + { 71102, 71102 }, + { 71216, 71218 }, + { 71227, 71228 }, + { 71230, 71230 }, { 71340, 71340 }, { 71342, 71343 }, { 71350, 71350 }, + { 71456, 71457 }, + { 71462, 71462 }, + { 
72199, 72200 }, + { 72249, 72249 }, + { 72279, 72280 }, + { 72343, 72343 }, + { 72751, 72751 }, + { 72766, 72766 }, + { 72873, 72873 }, + { 72881, 72881 }, + { 72884, 72884 }, { 94033, 94078 }, { 119141, 119142 }, { 119149, 119154 }, @@ -2310,8 +2548,8 @@ static const URange16 Mn_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2276, 2302 }, - { 2304, 2306 }, + { 2260, 2273 }, + { 2275, 2306 }, { 2362, 2362 }, { 2364, 2364 }, { 2369, 2376 }, @@ -2337,6 +2575,7 @@ static const URange16 Mn_range16[] = { { 2759, 2760 }, { 2765, 2765 }, { 2786, 2787 }, + { 2810, 2815 }, { 2817, 2817 }, { 2876, 2876 }, { 2879, 2879 }, @@ -2347,16 +2586,20 @@ static const URange16 Mn_range16[] = { { 2946, 2946 }, { 3008, 3008 }, { 3021, 3021 }, + { 3072, 3072 }, { 3134, 3136 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3170, 3171 }, + { 3201, 3201 }, { 3260, 3260 }, { 3263, 3263 }, { 3270, 3270 }, { 3276, 3277 }, { 3298, 3299 }, + { 3328, 3329 }, + { 3387, 3388 }, { 3393, 3396 }, { 3405, 3405 }, { 3426, 3427 }, @@ -2402,6 +2645,7 @@ static const URange16 Mn_range16[] = { { 6089, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6277, 6278 }, { 6313, 6313 }, { 6432, 6434 }, { 6439, 6440 }, @@ -2416,6 +2660,7 @@ static const URange16 Mn_range16[] = { { 6757, 6764 }, { 6771, 6780 }, { 6783, 6783 }, + { 6832, 6845 }, { 6912, 6915 }, { 6964, 6964 }, { 6966, 6970 }, @@ -2425,7 +2670,7 @@ static const URange16 Mn_range16[] = { { 7040, 7041 }, { 7074, 7077 }, { 7080, 7081 }, - { 7083, 7083 }, + { 7083, 7085 }, { 7142, 7142 }, { 7144, 7145 }, { 7149, 7149 }, @@ -2437,8 +2682,9 @@ static const URange16 Mn_range16[] = { { 7394, 7400 }, { 7405, 7405 }, { 7412, 7412 }, - { 7616, 7654 }, - { 7676, 7679 }, + { 7416, 7417 }, + { 7616, 7673 }, + { 7675, 7679 }, { 8400, 8412 }, { 8417, 8417 }, { 8421, 8432 }, @@ -2449,13 +2695,13 @@ static const URange16 Mn_range16[] = { { 12441, 12442 }, { 42607, 42607 }, { 42612, 42621 }, - { 42655, 42655 }, + { 42654, 42655 }, { 42736, 42737 }, { 
43010, 43010 }, { 43014, 43014 }, { 43019, 43019 }, { 43045, 43046 }, - { 43204, 43204 }, + { 43204, 43205 }, { 43232, 43249 }, { 43302, 43309 }, { 43335, 43345 }, @@ -2463,11 +2709,13 @@ static const URange16 Mn_range16[] = { { 43443, 43443 }, { 43446, 43449 }, { 43452, 43452 }, + { 43493, 43493 }, { 43561, 43566 }, { 43569, 43570 }, { 43573, 43574 }, { 43587, 43587 }, { 43596, 43596 }, + { 43644, 43644 }, { 43696, 43696 }, { 43698, 43700 }, { 43703, 43704 }, @@ -2480,35 +2728,105 @@ static const URange16 Mn_range16[] = { { 44013, 44013 }, { 64286, 64286 }, { 65024, 65039 }, - { 65056, 65062 }, + { 65056, 65071 }, }; static const URange32 Mn_range32[] = { { 66045, 66045 }, + { 66272, 66272 }, + { 66422, 66426 }, { 68097, 68099 }, { 68101, 68102 }, { 68108, 68111 }, { 68152, 68154 }, { 68159, 68159 }, + { 68325, 68326 }, { 69633, 69633 }, { 69688, 69702 }, - { 69760, 69761 }, + { 69759, 69761 }, { 69811, 69814 }, { 69817, 69818 }, { 69888, 69890 }, { 69927, 69931 }, { 69933, 69940 }, + { 70003, 70003 }, { 70016, 70017 }, { 70070, 70078 }, + { 70090, 70092 }, + { 70191, 70193 }, + { 70196, 70196 }, + { 70198, 70199 }, + { 70206, 70206 }, + { 70367, 70367 }, + { 70371, 70378 }, + { 70400, 70401 }, + { 70460, 70460 }, + { 70464, 70464 }, + { 70502, 70508 }, + { 70512, 70516 }, + { 70712, 70719 }, + { 70722, 70724 }, + { 70726, 70726 }, + { 70835, 70840 }, + { 70842, 70842 }, + { 70847, 70848 }, + { 70850, 70851 }, + { 71090, 71093 }, + { 71100, 71101 }, + { 71103, 71104 }, + { 71132, 71133 }, + { 71219, 71226 }, + { 71229, 71229 }, + { 71231, 71232 }, { 71339, 71339 }, { 71341, 71341 }, { 71344, 71349 }, { 71351, 71351 }, + { 71453, 71455 }, + { 71458, 71461 }, + { 71463, 71467 }, + { 72193, 72198 }, + { 72201, 72202 }, + { 72243, 72248 }, + { 72251, 72254 }, + { 72263, 72263 }, + { 72273, 72278 }, + { 72281, 72283 }, + { 72330, 72342 }, + { 72344, 72345 }, + { 72752, 72758 }, + { 72760, 72765 }, + { 72767, 72767 }, + { 72850, 72871 }, + { 72874, 72880 }, + { 72882, 
72883 }, + { 72885, 72886 }, + { 73009, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73029 }, + { 73031, 73031 }, + { 92912, 92916 }, + { 92976, 92982 }, { 94095, 94098 }, + { 113821, 113822 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, { 119210, 119213 }, { 119362, 119364 }, + { 121344, 121398 }, + { 121403, 121452 }, + { 121461, 121461 }, + { 121476, 121476 }, + { 121499, 121503 }, + { 121505, 121519 }, + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, + { 125136, 125142 }, + { 125252, 125258 }, { 917760, 917999 }, }; static const URange16 M_range16[] = { @@ -2535,8 +2853,8 @@ static const URange16 M_range16[] = { { 2085, 2087 }, { 2089, 2093 }, { 2137, 2139 }, - { 2276, 2302 }, - { 2304, 2307 }, + { 2260, 2273 }, + { 2275, 2307 }, { 2362, 2364 }, { 2366, 2383 }, { 2385, 2391 }, @@ -2562,6 +2880,7 @@ static const URange16 M_range16[] = { { 2759, 2761 }, { 2763, 2765 }, { 2786, 2787 }, + { 2810, 2815 }, { 2817, 2819 }, { 2876, 2876 }, { 2878, 2884 }, @@ -2574,20 +2893,21 @@ static const URange16 M_range16[] = { { 3014, 3016 }, { 3018, 3021 }, { 3031, 3031 }, - { 3073, 3075 }, + { 3072, 3075 }, { 3134, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, { 3170, 3171 }, - { 3202, 3203 }, + { 3201, 3203 }, { 3260, 3260 }, { 3262, 3268 }, { 3270, 3272 }, { 3274, 3277 }, { 3285, 3286 }, { 3298, 3299 }, - { 3330, 3331 }, + { 3328, 3331 }, + { 3387, 3388 }, { 3390, 3396 }, { 3398, 3400 }, { 3402, 3405 }, @@ -2633,15 +2953,15 @@ static const URange16 M_range16[] = { { 6068, 6099 }, { 6109, 6109 }, { 6155, 6157 }, + { 6277, 6278 }, { 6313, 6313 }, { 6432, 6443 }, { 6448, 6459 }, - { 6576, 6592 }, - { 6600, 6601 }, { 6679, 6683 }, { 6741, 6750 }, { 6752, 6780 }, { 6783, 6783 }, + { 6832, 6846 }, { 6912, 6916 }, { 6964, 6980 }, { 7019, 7027 }, @@ -2653,8 +2973,9 @@ static const URange16 M_range16[] = { { 7380, 7400 }, { 7405, 7405 }, { 7410, 7412 }, - { 7616, 7654 }, - { 7676, 
7679 }, + { 7415, 7417 }, + { 7616, 7673 }, + { 7675, 7679 }, { 8400, 8432 }, { 11503, 11505 }, { 11647, 11647 }, @@ -2663,23 +2984,24 @@ static const URange16 M_range16[] = { { 12441, 12442 }, { 42607, 42610 }, { 42612, 42621 }, - { 42655, 42655 }, + { 42654, 42655 }, { 42736, 42737 }, { 43010, 43010 }, { 43014, 43014 }, { 43019, 43019 }, { 43043, 43047 }, { 43136, 43137 }, - { 43188, 43204 }, + { 43188, 43205 }, { 43232, 43249 }, { 43302, 43309 }, { 43335, 43347 }, { 43392, 43395 }, { 43443, 43456 }, + { 43493, 43493 }, { 43561, 43574 }, { 43587, 43587 }, { 43596, 43597 }, - { 43643, 43643 }, + { 43643, 43645 }, { 43696, 43696 }, { 43698, 43700 }, { 43703, 43704 }, @@ -2691,32 +3013,87 @@ static const URange16 M_range16[] = { { 44012, 44013 }, { 64286, 64286 }, { 65024, 65039 }, - { 65056, 65062 }, + { 65056, 65071 }, }; static const URange32 M_range32[] = { { 66045, 66045 }, + { 66272, 66272 }, + { 66422, 66426 }, { 68097, 68099 }, { 68101, 68102 }, { 68108, 68111 }, { 68152, 68154 }, { 68159, 68159 }, + { 68325, 68326 }, { 69632, 69634 }, { 69688, 69702 }, - { 69760, 69762 }, + { 69759, 69762 }, { 69808, 69818 }, { 69888, 69890 }, { 69927, 69940 }, + { 70003, 70003 }, { 70016, 70018 }, { 70067, 70080 }, + { 70090, 70092 }, + { 70188, 70199 }, + { 70206, 70206 }, + { 70367, 70378 }, + { 70400, 70403 }, + { 70460, 70460 }, + { 70462, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70487, 70487 }, + { 70498, 70499 }, + { 70502, 70508 }, + { 70512, 70516 }, + { 70709, 70726 }, + { 70832, 70851 }, + { 71087, 71093 }, + { 71096, 71104 }, + { 71132, 71133 }, + { 71216, 71232 }, { 71339, 71351 }, + { 71453, 71467 }, + { 72193, 72202 }, + { 72243, 72249 }, + { 72251, 72254 }, + { 72263, 72263 }, + { 72273, 72283 }, + { 72330, 72345 }, + { 72751, 72758 }, + { 72760, 72767 }, + { 72850, 72871 }, + { 72873, 72886 }, + { 73009, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73029 }, + { 73031, 73031 }, + { 92912, 92916 }, + { 92976, 92982 }, { 94033, 94078 
}, { 94095, 94098 }, + { 113821, 113822 }, { 119141, 119145 }, { 119149, 119154 }, { 119163, 119170 }, { 119173, 119179 }, { 119210, 119213 }, { 119362, 119364 }, + { 121344, 121398 }, + { 121403, 121452 }, + { 121461, 121461 }, + { 121476, 121476 }, + { 121499, 121503 }, + { 121505, 121519 }, + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, + { 125136, 125142 }, + { 125252, 125258 }, { 917760, 917999 }, }; static const URange16 L_range16[] = { @@ -2735,13 +3112,14 @@ static const URange16 L_range16[] = { { 880, 884 }, { 886, 887 }, { 890, 893 }, + { 895, 895 }, { 902, 902 }, { 904, 906 }, { 908, 908 }, { 910, 929 }, { 931, 1013 }, { 1015, 1153 }, - { 1162, 1319 }, + { 1162, 1327 }, { 1329, 1366 }, { 1369, 1369 }, { 1377, 1415 }, @@ -2767,14 +3145,14 @@ static const URange16 L_range16[] = { { 2084, 2084 }, { 2088, 2088 }, { 2112, 2136 }, - { 2208, 2208 }, - { 2210, 2220 }, + { 2144, 2154 }, + { 2208, 2228 }, + { 2230, 2237 }, { 2308, 2361 }, { 2365, 2365 }, { 2384, 2384 }, { 2392, 2401 }, - { 2417, 2423 }, - { 2425, 2431 }, + { 2417, 2432 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -2786,6 +3164,7 @@ static const URange16 L_range16[] = { { 2524, 2525 }, { 2527, 2529 }, { 2544, 2545 }, + { 2556, 2556 }, { 2565, 2570 }, { 2575, 2576 }, { 2579, 2600 }, @@ -2805,6 +3184,7 @@ static const URange16 L_range16[] = { { 2749, 2749 }, { 2768, 2768 }, { 2784, 2785 }, + { 2809, 2809 }, { 2821, 2828 }, { 2831, 2832 }, { 2835, 2856 }, @@ -2829,11 +3209,11 @@ static const URange16 L_range16[] = { { 3077, 3084 }, { 3086, 3088 }, { 3090, 3112 }, - { 3114, 3123 }, - { 3125, 3129 }, + { 3114, 3129 }, { 3133, 3133 }, - { 3160, 3161 }, + { 3160, 3162 }, { 3168, 3169 }, + { 3200, 3200 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -2848,7 +3228,8 @@ static const URange16 L_range16[] = { { 3346, 3386 }, { 3389, 3389 }, { 3406, 3406 }, - { 3424, 3425 }, + { 3412, 3414 }, + { 3423, 3425 }, { 3450, 3455 }, { 3461, 
3478 }, { 3482, 3505 }, @@ -2909,11 +3290,13 @@ static const URange16 L_range16[] = { { 4882, 4885 }, { 4888, 4954 }, { 4992, 5007 }, - { 5024, 5108 }, + { 5024, 5109 }, + { 5112, 5117 }, { 5121, 5740 }, { 5743, 5759 }, { 5761, 5786 }, { 5792, 5866 }, + { 5873, 5880 }, { 5888, 5900 }, { 5902, 5905 }, { 5920, 5937 }, @@ -2924,14 +3307,15 @@ static const URange16 L_range16[] = { { 6103, 6103 }, { 6108, 6108 }, { 6176, 6263 }, - { 6272, 6312 }, + { 6272, 6276 }, + { 6279, 6312 }, { 6314, 6314 }, { 6320, 6389 }, - { 6400, 6428 }, + { 6400, 6430 }, { 6480, 6509 }, { 6512, 6516 }, { 6528, 6571 }, - { 6593, 6599 }, + { 6576, 6601 }, { 6656, 6678 }, { 6688, 6740 }, { 6823, 6823 }, @@ -2943,6 +3327,7 @@ static const URange16 L_range16[] = { { 7168, 7203 }, { 7245, 7247 }, { 7258, 7293 }, + { 7296, 7304 }, { 7401, 7404 }, { 7406, 7409 }, { 7413, 7414 }, @@ -3010,26 +3395,25 @@ static const URange16 L_range16[] = { { 12445, 12447 }, { 12449, 12538 }, { 12540, 12543 }, - { 12549, 12589 }, + { 12549, 12590 }, { 12593, 12686 }, { 12704, 12730 }, { 12784, 12799 }, { 13312, 19893 }, - { 19968, 40908 }, + { 19968, 40938 }, { 40960, 42124 }, { 42192, 42237 }, { 42240, 42508 }, { 42512, 42527 }, { 42538, 42539 }, { 42560, 42606 }, - { 42623, 42647 }, + { 42623, 42653 }, { 42656, 42725 }, { 42775, 42783 }, { 42786, 42888 }, - { 42891, 42894 }, - { 42896, 42899 }, - { 42912, 42922 }, - { 43000, 43009 }, + { 42891, 42926 }, + { 42928, 42935 }, + { 42999, 43009 }, { 43011, 43013 }, { 43015, 43018 }, { 43020, 43042 }, @@ -3037,17 +3421,21 @@ static const URange16 L_range16[] = { { 43138, 43187 }, { 43250, 43255 }, { 43259, 43259 }, + { 43261, 43261 }, { 43274, 43301 }, { 43312, 43334 }, { 43360, 43388 }, { 43396, 43442 }, { 43471, 43471 }, + { 43488, 43492 }, + { 43494, 43503 }, + { 43514, 43518 }, { 43520, 43560 }, { 43584, 43586 }, { 43588, 43595 }, { 43616, 43638 }, { 43642, 43642 }, - { 43648, 43695 }, + { 43646, 43695 }, { 43697, 43697 }, { 43701, 43702 }, { 43705, 43709 }, @@ 
-3061,7 +3449,9 @@ static const URange16 L_range16[] = { { 43793, 43798 }, { 43808, 43814 }, { 43816, 43822 }, - { 43968, 44002 }, + { 43824, 43866 }, + { 43868, 43877 }, + { 43888, 44002 }, { 44032, 55203 }, { 55216, 55238 }, { 55243, 55291 }, @@ -3101,19 +3491,31 @@ static const URange32 L_range32[] = { { 65664, 65786 }, { 66176, 66204 }, { 66208, 66256 }, - { 66304, 66334 }, - { 66352, 66368 }, + { 66304, 66335 }, + { 66349, 66368 }, { 66370, 66377 }, + { 66384, 66421 }, { 66432, 66461 }, { 66464, 66499 }, { 66504, 66511 }, { 66560, 66717 }, + { 66736, 66771 }, + { 66776, 66811 }, + { 66816, 66855 }, + { 66864, 66915 }, + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, { 67584, 67589 }, { 67592, 67592 }, { 67594, 67637 }, { 67639, 67640 }, { 67644, 67644 }, { 67647, 67669 }, + { 67680, 67702 }, + { 67712, 67742 }, + { 67808, 67826 }, + { 67828, 67829 }, { 67840, 67861 }, { 67872, 67897 }, { 67968, 68023 }, @@ -3123,24 +3525,94 @@ static const URange32 L_range32[] = { { 68117, 68119 }, { 68121, 68147 }, { 68192, 68220 }, + { 68224, 68252 }, + { 68288, 68295 }, + { 68297, 68324 }, { 68352, 68405 }, { 68416, 68437 }, { 68448, 68466 }, + { 68480, 68497 }, { 68608, 68680 }, + { 68736, 68786 }, + { 68800, 68850 }, { 69635, 69687 }, { 69763, 69807 }, { 69840, 69864 }, { 69891, 69926 }, + { 69968, 70002 }, + { 70006, 70006 }, { 70019, 70066 }, { 70081, 70084 }, + { 70106, 70106 }, + { 70108, 70108 }, + { 70144, 70161 }, + { 70163, 70187 }, + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70312 }, + { 70320, 70366 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70461, 70461 }, + { 70480, 70480 }, + { 70493, 70497 }, + { 70656, 70708 }, + { 70727, 70730 }, + { 70784, 70831 }, + { 70852, 70853 }, + { 70855, 70855 }, + { 71040, 71086 }, + { 71128, 71131 }, + { 71168, 71215 }, + { 71236, 71236 }, { 71296, 71338 }, - { 73728, 74606 }, + { 71424, 
71449 }, + { 71840, 71903 }, + { 71935, 71935 }, + { 72192, 72192 }, + { 72203, 72242 }, + { 72250, 72250 }, + { 72272, 72272 }, + { 72284, 72323 }, + { 72326, 72329 }, + { 72384, 72440 }, + { 72704, 72712 }, + { 72714, 72750 }, + { 72768, 72768 }, + { 72818, 72847 }, + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73008 }, + { 73030, 73030 }, + { 73728, 74649 }, + { 74880, 75075 }, { 77824, 78894 }, + { 82944, 83526 }, { 92160, 92728 }, + { 92736, 92766 }, + { 92880, 92909 }, + { 92928, 92975 }, + { 92992, 92995 }, + { 93027, 93047 }, + { 93053, 93071 }, { 93952, 94020 }, { 94032, 94032 }, { 94099, 94111 }, - { 110592, 110593 }, + { 94176, 94177 }, + { 94208, 100332 }, + { 100352, 101106 }, + { 110592, 110878 }, + { 110960, 111355 }, + { 113664, 113770 }, + { 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, { 119808, 119892 }, { 119894, 119964 }, { 119966, 119967 }, @@ -3171,6 +3643,8 @@ static const URange32 L_range32[] = { { 120714, 120744 }, { 120746, 120770 }, { 120772, 120779 }, + { 124928, 125124 }, + { 125184, 125251 }, { 126464, 126467 }, { 126469, 126495 }, { 126497, 126498 }, @@ -3207,6 +3681,8 @@ static const URange32 L_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, + { 178208, 183969 }, + { 183984, 191456 }, { 194560, 195101 }, }; static const URange16 N_range16[] = { @@ -3228,7 +3704,9 @@ static const URange16 N_range16[] = { { 3174, 3183 }, { 3192, 3198 }, { 3302, 3311 }, - { 3430, 3445 }, + { 3416, 3422 }, + { 3430, 3448 }, + { 3558, 3567 }, { 3664, 3673 }, { 3792, 3801 }, { 3872, 3891 }, @@ -3271,6 +3749,7 @@ static const URange16 N_range16[] = { { 43216, 43225 }, { 43264, 43273 }, { 43472, 43481 }, + { 43504, 43513 }, { 43600, 43609 }, { 44016, 44025 }, { 65296, 65305 }, @@ -3278,28 +3757,53 @@ static const URange16 N_range16[] = { static const URange32 N_range32[] = { { 65799, 65843 }, { 65856, 65912 }, - { 65930, 65930 }, + { 65930, 65931 }, + { 66273, 66299 }, { 66336, 66339 }, { 66369, 66369 }, { 
66378, 66378 }, { 66513, 66517 }, { 66720, 66729 }, { 67672, 67679 }, + { 67705, 67711 }, + { 67751, 67759 }, + { 67835, 67839 }, { 67862, 67867 }, + { 68028, 68029 }, + { 68032, 68047 }, + { 68050, 68095 }, { 68160, 68167 }, { 68221, 68222 }, + { 68253, 68255 }, + { 68331, 68335 }, { 68440, 68447 }, { 68472, 68479 }, + { 68521, 68527 }, + { 68858, 68863 }, { 69216, 69246 }, { 69714, 69743 }, { 69872, 69881 }, { 69942, 69951 }, { 70096, 70105 }, + { 70113, 70132 }, + { 70384, 70393 }, + { 70736, 70745 }, + { 70864, 70873 }, + { 71248, 71257 }, { 71360, 71369 }, - { 74752, 74850 }, + { 71472, 71483 }, + { 71904, 71922 }, + { 72784, 72812 }, + { 73040, 73049 }, + { 74752, 74862 }, + { 92768, 92777 }, + { 93008, 93017 }, + { 93019, 93025 }, { 119648, 119665 }, { 120782, 120831 }, - { 127232, 127242 }, + { 125127, 125135 }, + { 125264, 125273 }, + { 127232, 127244 }, }; static const URange16 Sk_range16[] = { { 94, 94 }, @@ -3325,11 +3829,15 @@ static const URange16 Sk_range16[] = { { 42752, 42774 }, { 42784, 42785 }, { 42889, 42890 }, + { 43867, 43867 }, { 64434, 64449 }, { 65342, 65342 }, { 65344, 65344 }, { 65507, 65507 }, }; +static const URange32 Sk_range32[] = { + { 127995, 127999 }, +}; static const URange16 P_range16[] = { { 33, 35 }, { 37, 42 }, @@ -3367,6 +3875,7 @@ static const URange16 P_range16[] = { { 2142, 2142 }, { 2404, 2405 }, { 2416, 2416 }, + { 2557, 2557 }, { 2800, 2800 }, { 3572, 3572 }, { 3663, 3663 }, @@ -3416,7 +3925,7 @@ static const URange16 P_range16[] = { { 11518, 11519 }, { 11632, 11632 }, { 11776, 11822 }, - { 11824, 11835 }, + { 11824, 11849 }, { 12289, 12291 }, { 12296, 12305 }, { 12308, 12319 }, @@ -3432,6 +3941,7 @@ static const URange16 P_range16[] = { { 43124, 43127 }, { 43214, 43215 }, { 43256, 43258 }, + { 43260, 43260 }, { 43310, 43311 }, { 43359, 43359 }, { 43457, 43469 }, @@ -3462,18 +3972,47 @@ static const URange32 P_range32[] = { { 65792, 65794 }, { 66463, 66463 }, { 66512, 66512 }, + { 66927, 66927 }, { 67671, 67671 }, { 
67871, 67871 }, { 67903, 67903 }, { 68176, 68184 }, { 68223, 68223 }, + { 68336, 68342 }, { 68409, 68415 }, + { 68505, 68508 }, { 69703, 69709 }, { 69819, 69820 }, { 69822, 69825 }, { 69952, 69955 }, - { 70085, 70088 }, - { 74864, 74867 }, + { 70004, 70005 }, + { 70085, 70089 }, + { 70093, 70093 }, + { 70107, 70107 }, + { 70109, 70111 }, + { 70200, 70205 }, + { 70313, 70313 }, + { 70731, 70735 }, + { 70747, 70747 }, + { 70749, 70749 }, + { 70854, 70854 }, + { 71105, 71127 }, + { 71233, 71235 }, + { 71264, 71276 }, + { 71484, 71486 }, + { 72255, 72262 }, + { 72346, 72348 }, + { 72350, 72354 }, + { 72769, 72773 }, + { 72816, 72817 }, + { 74864, 74868 }, + { 92782, 92783 }, + { 92917, 92917 }, + { 92983, 92987 }, + { 92996, 92996 }, + { 113823, 113823 }, + { 121479, 121483 }, + { 125278, 125279 }, }; static const URange16 S_range16[] = { { 36, 36 }, @@ -3500,7 +4039,7 @@ static const URange16 S_range16[] = { { 900, 901 }, { 1014, 1014 }, { 1154, 1154 }, - { 1423, 1423 }, + { 1421, 1423 }, { 1542, 1544 }, { 1547, 1547 }, { 1550, 1551 }, @@ -3514,6 +4053,7 @@ static const URange16 S_range16[] = { { 2928, 2928 }, { 3059, 3066 }, { 3199, 3199 }, + { 3407, 3407 }, { 3449, 3449 }, { 3647, 3647 }, { 3841, 3843 }, @@ -3544,7 +4084,7 @@ static const URange16 S_range16[] = { { 8274, 8274 }, { 8314, 8316 }, { 8330, 8332 }, - { 8352, 8378 }, + { 8352, 8383 }, { 8448, 8449 }, { 8451, 8454 }, { 8456, 8457 }, @@ -3559,21 +4099,24 @@ static const URange16 S_range16[] = { { 8512, 8516 }, { 8522, 8525 }, { 8527, 8527 }, + { 8586, 8587 }, { 8592, 8967 }, { 8972, 9000 }, - { 9003, 9203 }, - { 9216, 9254 }, + { 9003, 9254 }, { 9280, 9290 }, { 9372, 9449 }, - { 9472, 9983 }, - { 9985, 10087 }, + { 9472, 10087 }, { 10132, 10180 }, { 10183, 10213 }, { 10224, 10626 }, { 10649, 10711 }, { 10716, 10747 }, - { 10750, 11084 }, - { 11088, 11097 }, + { 10750, 11123 }, + { 11126, 11157 }, + { 11160, 11193 }, + { 11197, 11208 }, + { 11210, 11218 }, + { 11244, 11247 }, { 11493, 11498 }, { 11904, 11929 
}, { 11931, 12019 }, @@ -3603,6 +4146,7 @@ static const URange16 S_range16[] = { { 43048, 43051 }, { 43062, 43065 }, { 43639, 43641 }, + { 43867, 43867 }, { 64297, 64297 }, { 64434, 64449 }, { 65020, 65021 }, @@ -3623,15 +4167,23 @@ static const URange16 S_range16[] = { static const URange32 S_range32[] = { { 65847, 65855 }, { 65913, 65929 }, + { 65932, 65934 }, { 65936, 65947 }, + { 65952, 65952 }, { 66000, 66044 }, + { 67703, 67704 }, + { 68296, 68296 }, + { 71487, 71487 }, + { 92988, 92991 }, + { 92997, 92997 }, + { 113820, 113820 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119261 }, + { 119214, 119272 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, @@ -3645,38 +4197,43 @@ static const URange32 S_range32[] = { { 120713, 120713 }, { 120745, 120745 }, { 120771, 120771 }, + { 120832, 121343 }, + { 121399, 121402 }, + { 121453, 121460 }, + { 121462, 121475 }, + { 121477, 121478 }, { 126704, 126705 }, { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, - { 127153, 127166 }, + { 127153, 127167 }, { 127169, 127183 }, - { 127185, 127199 }, + { 127185, 127221 }, { 127248, 127278 }, { 127280, 127339 }, - { 127344, 127386 }, + { 127344, 127404 }, { 127462, 127490 }, - { 127504, 127546 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 127776 }, - { 127792, 127797 }, - { 127799, 127868 }, - { 127872, 127891 }, - { 127904, 127940 }, - { 127942, 127946 }, - { 127968, 127984 }, - { 128000, 128062 }, - { 128064, 128064 }, - { 128066, 128247 }, - { 128249, 128252 }, - { 128256, 128317 }, - { 128320, 128323 }, - { 128336, 128359 }, - { 128507, 128576 }, - { 128581, 128591 }, - { 128640, 128709 }, + { 127584, 127589 }, + { 127744, 128724 }, + { 128736, 128748 }, + { 128752, 128760 }, { 128768, 128883 }, + { 128896, 128980 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 }, + { 129120, 129159 }, + { 129168, 129197 }, + { 
129280, 129291 }, + { 129296, 129342 }, + { 129344, 129356 }, + { 129360, 129387 }, + { 129408, 129431 }, + { 129472, 129472 }, + { 129488, 129510 }, }; static const URange16 So_range16[] = { { 166, 166 }, @@ -3684,6 +4241,7 @@ static const URange16 So_range16[] = { { 174, 174 }, { 176, 176 }, { 1154, 1154 }, + { 1421, 1422 }, { 1550, 1551 }, { 1758, 1758 }, { 1769, 1769 }, @@ -3694,6 +4252,7 @@ static const URange16 So_range16[] = { { 3059, 3064 }, { 3066, 3066 }, { 3199, 3199 }, + { 3407, 3407 }, { 3449, 3449 }, { 3841, 3843 }, { 3859, 3859 }, @@ -3726,6 +4285,7 @@ static const URange16 So_range16[] = { { 8522, 8522 }, { 8524, 8525 }, { 8527, 8527 }, + { 8586, 8587 }, { 8597, 8601 }, { 8604, 8607 }, { 8609, 8610 }, @@ -3741,21 +4301,24 @@ static const URange16 So_range16[] = { { 9003, 9083 }, { 9085, 9114 }, { 9140, 9179 }, - { 9186, 9203 }, - { 9216, 9254 }, + { 9186, 9254 }, { 9280, 9290 }, { 9372, 9449 }, { 9472, 9654 }, { 9656, 9664 }, { 9666, 9719 }, { 9728, 9838 }, - { 9840, 9983 }, - { 9985, 10087 }, + { 9840, 10087 }, { 10132, 10175 }, { 10240, 10495 }, { 11008, 11055 }, { 11077, 11078 }, - { 11088, 11097 }, + { 11085, 11123 }, + { 11126, 11157 }, + { 11160, 11193 }, + { 11197, 11208 }, + { 11210, 11218 }, + { 11244, 11247 }, { 11493, 11498 }, { 11904, 11929 }, { 11931, 12019 }, @@ -3791,49 +4354,63 @@ static const URange16 So_range16[] = { static const URange32 So_range32[] = { { 65847, 65855 }, { 65913, 65929 }, + { 65932, 65934 }, { 65936, 65947 }, + { 65952, 65952 }, { 66000, 66044 }, + { 67703, 67704 }, + { 68296, 68296 }, + { 71487, 71487 }, + { 92988, 92991 }, + { 92997, 92997 }, + { 113820, 113820 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119140 }, { 119146, 119148 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119261 }, + { 119214, 119272 }, { 119296, 119361 }, { 119365, 119365 }, { 119552, 119638 }, + { 120832, 121343 }, + { 121399, 121402 }, + { 121453, 121460 }, + { 121462, 121475 }, + { 121477, 121478 }, { 126976, 127019 }, 
{ 127024, 127123 }, { 127136, 127150 }, - { 127153, 127166 }, + { 127153, 127167 }, { 127169, 127183 }, - { 127185, 127199 }, + { 127185, 127221 }, { 127248, 127278 }, { 127280, 127339 }, - { 127344, 127386 }, + { 127344, 127404 }, { 127462, 127490 }, - { 127504, 127546 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 127776 }, - { 127792, 127797 }, - { 127799, 127868 }, - { 127872, 127891 }, - { 127904, 127940 }, - { 127942, 127946 }, - { 127968, 127984 }, - { 128000, 128062 }, - { 128064, 128064 }, - { 128066, 128247 }, - { 128249, 128252 }, - { 128256, 128317 }, - { 128320, 128323 }, - { 128336, 128359 }, - { 128507, 128576 }, - { 128581, 128591 }, - { 128640, 128709 }, + { 127584, 127589 }, + { 127744, 127994 }, + { 128000, 128724 }, + { 128736, 128748 }, + { 128752, 128760 }, { 128768, 128883 }, + { 128896, 128980 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 }, + { 129120, 129159 }, + { 129168, 129197 }, + { 129280, 129291 }, + { 129296, 129342 }, + { 129344, 129356 }, + { 129360, 129387 }, + { 129408, 129431 }, + { 129472, 129472 }, + { 129488, 129510 }, }; static const URange16 Sm_range16[] = { { 43, 43 }, @@ -3914,7 +4491,7 @@ static const URange16 Sc_range16[] = { { 3065, 3065 }, { 3647, 3647 }, { 6107, 6107 }, - { 8352, 8378 }, + { 8352, 8383 }, { 43064, 43064 }, { 65020, 65020 }, { 65129, 65129 }, @@ -3948,10 +4525,11 @@ static const URange16 Cc_range16[] = { }; static const URange16 Cf_range16[] = { { 173, 173 }, - { 1536, 1540 }, + { 1536, 1541 }, { 1564, 1564 }, { 1757, 1757 }, { 1807, 1807 }, + { 2274, 2274 }, { 6158, 6158 }, { 8203, 8207 }, { 8234, 8238 }, @@ -3962,6 +4540,7 @@ static const URange16 Cf_range16[] = { }; static const URange32 Cf_range32[] = { { 69821, 69821 }, + { 113824, 113827 }, { 119155, 119162 }, { 917505, 917505 }, { 917536, 917631 }, @@ -3981,33 +4560,46 @@ static const URange16 Zs_range16[] = { { 8287, 8287 }, { 12288, 12288 }, }; +static const URange32 Tangut_range32[] = { + { 
94176, 94176 }, + { 94208, 100332 }, + { 100352, 101106 }, +}; static const URange16 Thaana_range16[] = { { 1920, 1969 }, }; +static const URange32 Adlam_range32[] = { + { 125184, 125258 }, + { 125264, 125273 }, + { 125278, 125279 }, +}; static const URange16 Telugu_range16[] = { - { 3073, 3075 }, + { 3072, 3075 }, { 3077, 3084 }, { 3086, 3088 }, { 3090, 3112 }, - { 3114, 3123 }, - { 3125, 3129 }, + { 3114, 3129 }, { 3133, 3140 }, { 3142, 3144 }, { 3146, 3149 }, { 3157, 3158 }, - { 3160, 3161 }, + { 3160, 3162 }, { 3168, 3171 }, { 3174, 3183 }, { 3192, 3199 }, }; static const URange16 Cyrillic_range16[] = { { 1024, 1156 }, - { 1159, 1319 }, + { 1159, 1327 }, + { 7296, 7304 }, { 7467, 7467 }, { 7544, 7544 }, { 11744, 11775 }, - { 42560, 42647 }, - { 42655, 42655 }, + { 42560, 42655 }, + { 65070, 65071 }, +}; +static const URange32 Zanabazar_Square_range32[] = { + { 72192, 72263 }, }; static const URange16 Hangul_range16[] = { { 4352, 4607 }, @@ -4068,22 +4660,25 @@ static const URange16 Inherited_range16[] = { { 1611, 1621 }, { 1648, 1648 }, { 2385, 2386 }, + { 6832, 6846 }, { 7376, 7378 }, { 7380, 7392 }, { 7394, 7400 }, { 7405, 7405 }, { 7412, 7412 }, - { 7616, 7654 }, - { 7676, 7679 }, + { 7416, 7417 }, + { 7616, 7673 }, + { 7675, 7679 }, { 8204, 8205 }, { 8400, 8432 }, { 12330, 12333 }, { 12441, 12442 }, { 65024, 65039 }, - { 65056, 65062 }, + { 65056, 65069 }, }; static const URange32 Inherited_range32[] = { { 66045, 66045 }, + { 66272, 66272 }, { 119143, 119145 }, { 119163, 119170 }, { 119173, 119179 }, @@ -4092,7 +4687,19 @@ static const URange32 Inherited_range32[] = { }; static const URange32 Meroitic_Cursive_range32[] = { { 68000, 68023 }, - { 68030, 68031 }, + { 68028, 68047 }, + { 68050, 68095 }, +}; +static const URange32 Bhaiksuki_range32[] = { + { 72704, 72712 }, + { 72714, 72758 }, + { 72760, 72773 }, + { 72784, 72812 }, +}; +static const URange32 Ahom_range32[] = { + { 71424, 71449 }, + { 71453, 71467 }, + { 71472, 71487 }, }; static const URange16 
Han_range16[] = { { 11904, 11929 }, @@ -4103,7 +4710,7 @@ static const URange16 Han_range16[] = { { 12321, 12329 }, { 12344, 12347 }, { 13312, 19893 }, - { 19968, 40908 }, + { 19968, 40938 }, { 63744, 64109 }, { 64112, 64217 }, }; @@ -4111,14 +4718,19 @@ static const URange32 Han_range32[] = { { 131072, 173782 }, { 173824, 177972 }, { 177984, 178205 }, + { 178208, 183969 }, + { 183984, 191456 }, { 194560, 195101 }, }; +static const URange32 Old_North_Arabian_range32[] = { + { 68224, 68255 }, +}; static const URange16 Armenian_range16[] = { { 1329, 1366 }, { 1369, 1375 }, { 1377, 1415 }, { 1418, 1418 }, - { 1423, 1423 }, + { 1421, 1423 }, { 64275, 64279 }, }; static const URange16 Tamil_range16[] = { @@ -4141,29 +4753,39 @@ static const URange16 Tamil_range16[] = { }; static const URange16 Bopomofo_range16[] = { { 746, 747 }, - { 12549, 12589 }, + { 12549, 12590 }, { 12704, 12730 }, }; +static const URange32 Bassa_Vah_range32[] = { + { 92880, 92909 }, + { 92912, 92917 }, +}; static const URange16 Sundanese_range16[] = { { 7040, 7103 }, { 7360, 7367 }, }; +static const URange32 Osage_range32[] = { + { 66736, 66771 }, + { 66776, 66811 }, +}; static const URange16 Tagalog_range16[] = { { 5888, 5900 }, { 5902, 5908 }, }; static const URange16 Malayalam_range16[] = { - { 3330, 3331 }, + { 3328, 3331 }, { 3333, 3340 }, { 3342, 3344 }, - { 3346, 3386 }, - { 3389, 3396 }, + { 3346, 3396 }, { 3398, 3400 }, - { 3402, 3406 }, - { 3415, 3415 }, - { 3424, 3427 }, - { 3430, 3445 }, - { 3449, 3455 }, + { 3402, 3407 }, + { 3412, 3427 }, + { 3430, 3455 }, +}; +static const URange32 Marchen_range32[] = { + { 72816, 72847 }, + { 72850, 72871 }, + { 72873, 72886 }, }; static const URange32 Carian_range32[] = { { 66208, 66256 }, @@ -4173,7 +4795,7 @@ static const URange16 Hiragana_range16[] = { { 12445, 12447 }, }; static const URange32 Hiragana_range32[] = { - { 110593, 110593 }, + { 110593, 110878 }, { 127488, 127488 }, }; static const URange16 Tagbanwa_range16[] = { @@ -4186,12 
+4808,20 @@ static const URange16 Meetei_Mayek_range16[] = { { 43968, 44013 }, { 44016, 44025 }, }; +static const URange32 Pahawh_Hmong_range32[] = { + { 92928, 92997 }, + { 93008, 93017 }, + { 93019, 93025 }, + { 93027, 93047 }, + { 93053, 93071 }, +}; static const URange16 Tai_Le_range16[] = { { 6480, 6509 }, { 6512, 6516 }, }; static const URange16 Kayah_Li_range16[] = { - { 43264, 43311 }, + { 43264, 43309 }, + { 43311, 43311 }, }; static const URange16 Buginese_range16[] = { { 6656, 6683 }, @@ -4215,13 +4845,17 @@ static const URange16 Tai_Tham_range16[] = { { 6816, 6829 }, }; static const URange32 Old_Italic_range32[] = { - { 66304, 66334 }, - { 66336, 66339 }, + { 66304, 66339 }, + { 66349, 66351 }, }; static const URange32 Old_Persian_range32[] = { { 66464, 66499 }, { 66504, 66517 }, }; +static const URange32 Warang_Citi_range32[] = { + { 71840, 71922 }, + { 71935, 71935 }, +}; static const URange16 Latin_range16[] = { { 65, 90 }, { 97, 122 }, @@ -4246,16 +4880,17 @@ static const URange16 Latin_range16[] = { { 8544, 8584 }, { 11360, 11391 }, { 42786, 42887 }, - { 42891, 42894 }, - { 42896, 42899 }, - { 42912, 42922 }, - { 43000, 43007 }, + { 42891, 42926 }, + { 42928, 42935 }, + { 42999, 43007 }, + { 43824, 43866 }, + { 43868, 43876 }, { 64256, 64262 }, { 65313, 65338 }, { 65345, 65370 }, }; static const URange16 Saurashtra_range16[] = { - { 43136, 43204 }, + { 43136, 43205 }, { 43214, 43225 }, }; static const URange32 Shavian_range32[] = { @@ -4271,6 +4906,30 @@ static const URange16 Georgian_range16[] = { { 11559, 11559 }, { 11565, 11565 }, }; +static const URange32 Grantha_range32[] = { + { 70400, 70403 }, + { 70405, 70412 }, + { 70415, 70416 }, + { 70419, 70440 }, + { 70442, 70448 }, + { 70450, 70451 }, + { 70453, 70457 }, + { 70460, 70468 }, + { 70471, 70472 }, + { 70475, 70477 }, + { 70480, 70480 }, + { 70487, 70487 }, + { 70493, 70499 }, + { 70502, 70508 }, + { 70512, 70516 }, +}; +static const URange32 Duployan_range32[] = { + { 113664, 113770 }, + 
{ 113776, 113788 }, + { 113792, 113800 }, + { 113808, 113817 }, + { 113820, 113823 }, +}; static const URange16 Batak_range16[] = { { 7104, 7155 }, { 7164, 7167 }, @@ -4278,9 +4937,8 @@ static const URange16 Batak_range16[] = { static const URange16 Devanagari_range16[] = { { 2304, 2384 }, { 2387, 2403 }, - { 2406, 2423 }, - { 2425, 2431 }, - { 43232, 43259 }, + { 2406, 2431 }, + { 43232, 43261 }, }; static const URange16 Thai_range16[] = { { 3585, 3642 }, @@ -4307,10 +4965,14 @@ static const URange32 Ugaritic_range32[] = { static const URange16 Braille_range16[] = { { 10240, 10495 }, }; +static const URange32 Anatolian_Hieroglyphs_range32[] = { + { 82944, 83526 }, +}; static const URange16 Greek_range16[] = { { 880, 883 }, { 885, 887 }, { 890, 893 }, + { 895, 895 }, { 900, 900 }, { 902, 902 }, { 904, 906 }, @@ -4339,14 +5001,20 @@ static const URange16 Greek_range16[] = { { 8178, 8180 }, { 8182, 8190 }, { 8486, 8486 }, + { 43877, 43877 }, }; static const URange32 Greek_range32[] = { - { 65856, 65930 }, + { 65856, 65934 }, + { 65952, 65952 }, { 119296, 119365 }, }; static const URange32 Lycian_range32[] = { { 66176, 66204 }, }; +static const URange32 Mende_Kikakui_range32[] = { + { 124928, 125124 }, + { 125127, 125142 }, +}; static const URange16 Tai_Viet_range16[] = { { 43648, 43714 }, { 43739, 43743 }, @@ -4371,14 +5039,18 @@ static const URange16 Syriac_range16[] = { { 1792, 1805 }, { 1807, 1866 }, { 1869, 1871 }, + { 2144, 2154 }, }; static const URange16 Runic_range16[] = { { 5792, 5866 }, - { 5870, 5872 }, + { 5870, 5880 }, }; static const URange32 Gothic_range32[] = { { 66352, 66378 }, }; +static const URange32 Mahajani_range32[] = { + { 69968, 70006 }, +}; static const URange16 Katakana_range16[] = { { 12449, 12538 }, { 12541, 12543 }, @@ -4404,15 +5076,25 @@ static const URange16 New_Tai_Lue_range16[] = { static const URange16 Ol_Chiki_range16[] = { { 7248, 7295 }, }; +static const URange32 Newa_range32[] = { + { 70656, 70745 }, + { 70747, 70747 }, + { 
70749, 70749 }, +}; static const URange16 Limbu_range16[] = { - { 6400, 6428 }, + { 6400, 6430 }, { 6432, 6443 }, { 6448, 6459 }, { 6464, 6464 }, { 6468, 6479 }, }; +static const URange32 Pau_Cin_Hau_range32[] = { + { 72384, 72440 }, +}; static const URange16 Cherokee_range16[] = { - { 5024, 5108 }, + { 5024, 5109 }, + { 5112, 5117 }, + { 43888, 43967 }, }; static const URange32 Miao_range32[] = { { 93952, 94020 }, @@ -4436,8 +5118,8 @@ static const URange16 Oriya_range16[] = { { 2918, 2935 }, }; static const URange32 Sharada_range32[] = { - { 70016, 70088 }, - { 70096, 70105 }, + { 70016, 70093 }, + { 70096, 70111 }, }; static const URange16 Gujarati_range16[] = { { 2689, 2691 }, @@ -4453,11 +5135,24 @@ static const URange16 Gujarati_range16[] = { { 2768, 2768 }, { 2784, 2787 }, { 2790, 2801 }, + { 2809, 2815 }, +}; +static const URange32 Nushu_range32[] = { + { 94177, 94177 }, + { 110960, 111355 }, +}; +static const URange32 Modi_range32[] = { + { 71168, 71236 }, + { 71248, 71257 }, }; static const URange32 Inscriptional_Pahlavi_range32[] = { { 68448, 68466 }, { 68472, 68479 }, }; +static const URange32 Manichaean_range32[] = { + { 68288, 68326 }, + { 68331, 68342 }, +}; static const URange16 Khmer_range16[] = { { 6016, 6109 }, { 6112, 6121 }, @@ -4465,14 +5160,24 @@ static const URange16 Khmer_range16[] = { { 6624, 6655 }, }; static const URange32 Cuneiform_range32[] = { - { 73728, 74606 }, - { 74752, 74850 }, - { 74864, 74867 }, + { 73728, 74649 }, + { 74752, 74862 }, + { 74864, 74868 }, + { 74880, 75075 }, +}; +static const URange32 Khudawadi_range32[] = { + { 70320, 70378 }, + { 70384, 70393 }, }; static const URange16 Mandaic_range16[] = { { 2112, 2139 }, { 2142, 2142 }, }; +static const URange32 Hatran_range32[] = { + { 67808, 67826 }, + { 67828, 67829 }, + { 67835, 67839 }, +}; static const URange16 Syloti_Nagri_range16[] = { { 43008, 43051 }, }; @@ -4490,8 +5195,12 @@ static const URange32 Phoenician_range32[] = { { 67840, 67867 }, { 67871, 67871 }, }; 
+static const URange32 Nabataean_range32[] = { + { 67712, 67742 }, + { 67751, 67759 }, +}; static const URange16 Bengali_range16[] = { - { 2433, 2435 }, + { 2432, 2435 }, { 2437, 2444 }, { 2447, 2448 }, { 2451, 2472 }, @@ -4504,7 +5213,7 @@ static const URange16 Bengali_range16[] = { { 2519, 2519 }, { 2524, 2525 }, { 2527, 2531 }, - { 2534, 2555 }, + { 2534, 2557 }, }; static const URange32 Kaithi_range32[] = { { 69760, 69825 }, @@ -4513,6 +5222,13 @@ static const URange16 Glagolitic_range16[] = { { 11264, 11310 }, { 11312, 11358 }, }; +static const URange32 Glagolitic_range32[] = { + { 122880, 122886 }, + { 122888, 122904 }, + { 122907, 122913 }, + { 122915, 122916 }, + { 122918, 122922 }, +}; static const URange32 Imperial_Aramaic_range32[] = { { 67648, 67669 }, { 67671, 67679 }, @@ -4544,6 +5260,9 @@ static const URange16 Javanese_range16[] = { { 43472, 43481 }, { 43486, 43487 }, }; +static const URange32 Old_Permic_range32[] = { + { 66384, 66426 }, +}; static const URange16 Phags_Pa_range16[] = { { 43072, 43127 }, }; @@ -4556,7 +5275,7 @@ static const URange32 Cypriot_range32[] = { { 67647, 67647 }, }; static const URange16 Kannada_range16[] = { - { 3202, 3203 }, + { 3200, 3203 }, { 3205, 3212 }, { 3214, 3216 }, { 3218, 3240 }, @@ -4571,6 +5290,10 @@ static const URange16 Kannada_range16[] = { { 3302, 3311 }, { 3313, 3314 }, }; +static const URange32 Khojki_range32[] = { + { 70144, 70161 }, + { 70163, 70206 }, +}; static const URange16 Mongolian_range16[] = { { 6144, 6145 }, { 6148, 6148 }, @@ -4579,6 +5302,9 @@ static const URange16 Mongolian_range16[] = { { 6176, 6263 }, { 6272, 6314 }, }; +static const URange32 Mongolian_range32[] = { + { 71264, 71276 }, +}; static const URange16 Sinhala_range16[] = { { 3458, 3459 }, { 3461, 3478 }, @@ -4590,11 +5316,19 @@ static const URange16 Sinhala_range16[] = { { 3535, 3540 }, { 3542, 3542 }, { 3544, 3551 }, + { 3558, 3567 }, { 3570, 3572 }, }; +static const URange32 Sinhala_range32[] = { + { 70113, 70132 }, +}; static 
const URange32 Brahmi_range32[] = { { 69632, 69709 }, { 69714, 69743 }, + { 69759, 69759 }, +}; +static const URange32 Elbasan_range32[] = { + { 66816, 66855 }, }; static const URange32 Deseret_range32[] = { { 66560, 66639 }, @@ -4603,6 +5337,18 @@ static const URange16 Rejang_range16[] = { { 43312, 43347 }, { 43359, 43359 }, }; +static const URange32 SignWriting_range32[] = { + { 120832, 121483 }, + { 121499, 121503 }, + { 121505, 121519 }, +}; +static const URange32 Multani_range32[] = { + { 70272, 70278 }, + { 70280, 70280 }, + { 70282, 70285 }, + { 70287, 70301 }, + { 70303, 70313 }, +}; static const URange16 Yi_range16[] = { { 40960, 42124 }, { 42128, 42182 }, @@ -4634,6 +5380,15 @@ static const URange16 Lao_range16[] = { static const URange16 Hanunoo_range16[] = { { 5920, 5940 }, }; +static const URange32 Masaram_Gondi_range32[] = { + { 72960, 72966 }, + { 72968, 72969 }, + { 72971, 73014 }, + { 73018, 73018 }, + { 73020, 73021 }, + { 73023, 73031 }, + { 73040, 73049 }, +}; static const URange32 Linear_B_range32[] = { { 65536, 65547 }, { 65549, 65574 }, @@ -4643,6 +5398,11 @@ static const URange32 Linear_B_range32[] = { { 65616, 65629 }, { 65664, 65786 }, }; +static const URange32 Linear_A_range32[] = { + { 67072, 67382 }, + { 67392, 67413 }, + { 67424, 67431 }, +}; static const URange32 Old_Turkic_range32[] = { { 68608, 68680 }, }; @@ -4658,6 +5418,15 @@ static const URange32 Lydian_range32[] = { static const URange32 Egyptian_Hieroglyphs_range32[] = { { 77824, 78894 }, }; +static const URange32 Caucasian_Albanian_range32[] = { + { 66864, 66915 }, + { 66927, 66927 }, +}; +static const URange32 Old_Hungarian_range32[] = { + { 68736, 68786 }, + { 68800, 68850 }, + { 68858, 68863 }, +}; static const URange16 Samaritan_range16[] = { { 2048, 2093 }, { 2096, 2110 }, @@ -4668,6 +5437,18 @@ static const URange16 Lisu_range16[] = { static const URange16 Buhid_range16[] = { { 5952, 5971 }, }; +static const URange32 Palmyrene_range32[] = { + { 67680, 67711 }, +}; 
+static const URange32 Tirhuta_range32[] = { + { 70784, 70855 }, + { 70864, 70873 }, +}; +static const URange32 Mro_range32[] = { + { 92736, 92766 }, + { 92768, 92777 }, + { 92782, 92783 }, +}; static const URange16 Common_range16[] = { { 0, 64 }, { 91, 96 }, @@ -4684,12 +5465,13 @@ static const URange16 Common_range16[] = { { 901, 901 }, { 903, 903 }, { 1417, 1417 }, + { 1541, 1541 }, { 1548, 1548 }, { 1563, 1563 }, { 1567, 1567 }, { 1600, 1600 }, - { 1632, 1641 }, { 1757, 1757 }, + { 2274, 2274 }, { 2404, 2405 }, { 3647, 3647 }, { 4053, 4056 }, @@ -4702,27 +5484,29 @@ static const URange16 Common_range16[] = { { 7393, 7393 }, { 7401, 7404 }, { 7406, 7411 }, - { 7413, 7414 }, + { 7413, 7415 }, { 8192, 8203 }, { 8206, 8292 }, { 8294, 8304 }, { 8308, 8318 }, { 8320, 8334 }, - { 8352, 8378 }, + { 8352, 8383 }, { 8448, 8485 }, { 8487, 8489 }, { 8492, 8497 }, { 8499, 8525 }, { 8527, 8543 }, - { 8585, 8585 }, - { 8592, 9203 }, - { 9216, 9254 }, + { 8585, 8587 }, + { 8592, 9254 }, { 9280, 9290 }, - { 9312, 9983 }, - { 9985, 10239 }, - { 10496, 11084 }, - { 11088, 11097 }, - { 11776, 11835 }, + { 9312, 10239 }, + { 10496, 11123 }, + { 11126, 11157 }, + { 11160, 11193 }, + { 11197, 11208 }, + { 11210, 11218 }, + { 11244, 11247 }, + { 11776, 11849 }, { 12272, 12283 }, { 12288, 12292 }, { 12294, 12294 }, @@ -4741,9 +5525,10 @@ static const URange16 Common_range16[] = { { 42752, 42785 }, { 42888, 42890 }, { 43056, 43065 }, + { 43310, 43310 }, { 43471, 43471 }, + { 43867, 43867 }, { 64830, 64831 }, - { 65021, 65021 }, { 65040, 65049 }, { 65072, 65106 }, { 65108, 65126 }, @@ -4764,13 +5549,15 @@ static const URange32 Common_range32[] = { { 65847, 65855 }, { 65936, 65947 }, { 66000, 66044 }, + { 66273, 66299 }, + { 113824, 113827 }, { 118784, 119029 }, { 119040, 119078 }, { 119081, 119142 }, { 119146, 119162 }, { 119171, 119172 }, { 119180, 119209 }, - { 119214, 119261 }, + { 119214, 119272 }, { 119552, 119638 }, { 119648, 119665 }, { 119808, 119892 }, @@ -4797,36 +5584,36 @@ 
static const URange32 Common_range32[] = { { 126976, 127019 }, { 127024, 127123 }, { 127136, 127150 }, - { 127153, 127166 }, + { 127153, 127167 }, { 127169, 127183 }, - { 127185, 127199 }, - { 127232, 127242 }, + { 127185, 127221 }, + { 127232, 127244 }, { 127248, 127278 }, { 127280, 127339 }, - { 127344, 127386 }, + { 127344, 127404 }, { 127462, 127487 }, { 127489, 127490 }, - { 127504, 127546 }, + { 127504, 127547 }, { 127552, 127560 }, { 127568, 127569 }, - { 127744, 127776 }, - { 127792, 127797 }, - { 127799, 127868 }, - { 127872, 127891 }, - { 127904, 127940 }, - { 127942, 127946 }, - { 127968, 127984 }, - { 128000, 128062 }, - { 128064, 128064 }, - { 128066, 128247 }, - { 128249, 128252 }, - { 128256, 128317 }, - { 128320, 128323 }, - { 128336, 128359 }, - { 128507, 128576 }, - { 128581, 128591 }, - { 128640, 128709 }, + { 127584, 127589 }, + { 127744, 128724 }, + { 128736, 128748 }, + { 128752, 128760 }, { 128768, 128883 }, + { 128896, 128980 }, + { 129024, 129035 }, + { 129040, 129095 }, + { 129104, 129113 }, + { 129120, 129159 }, + { 129168, 129197 }, + { 129280, 129291 }, + { 129296, 129342 }, + { 129344, 129356 }, + { 129360, 129387 }, + { 129408, 129431 }, + { 129472, 129472 }, + { 129488, 129510 }, { 917505, 917505 }, { 917536, 917631 }, }; @@ -4847,19 +5634,19 @@ static const URange16 Arabic_range16[] = { { 1566, 1566 }, { 1568, 1599 }, { 1601, 1610 }, - { 1622, 1631 }, - { 1642, 1647 }, + { 1622, 1647 }, { 1649, 1756 }, { 1758, 1791 }, { 1872, 1919 }, - { 2208, 2208 }, - { 2210, 2220 }, - { 2276, 2302 }, + { 2208, 2228 }, + { 2230, 2237 }, + { 2260, 2273 }, + { 2275, 2303 }, { 64336, 64449 }, { 64467, 64829 }, { 64848, 64911 }, { 64914, 64967 }, - { 65008, 65020 }, + { 65008, 65021 }, { 65136, 65140 }, { 65142, 65276 }, }; @@ -4908,7 +5695,17 @@ static const URange32 Bamum_range32[] = { }; static const URange16 Myanmar_range16[] = { { 4096, 4255 }, - { 43616, 43643 }, + { 43488, 43518 }, + { 43616, 43647 }, +}; +static const URange32 
Siddham_range32[] = { + { 71040, 71093 }, + { 71096, 71133 }, +}; +static const URange32 Soyombo_range32[] = { + { 72272, 72323 }, + { 72326, 72348 }, + { 72350, 72354 }, }; static const URange32 Avestan_range32[] = { { 68352, 68405 }, @@ -4925,131 +5722,171 @@ static const URange16 Hebrew_range16[] = { { 64323, 64324 }, { 64326, 64335 }, }; +static const URange32 Psalter_Pahlavi_range32[] = { + { 68480, 68497 }, + { 68505, 68508 }, + { 68521, 68527 }, +}; static const URange32 Takri_range32[] = { { 71296, 71351 }, { 71360, 71369 }, }; -// 3867 16-bit ranges, 723 32-bit ranges +// 3981 16-bit ranges, 1325 32-bit ranges const UGroup unicode_groups[] = { + { "Adlam", +1, 0, 0, Adlam_range32, 3 }, + { "Ahom", +1, 0, 0, Ahom_range32, 3 }, + { "Anatolian_Hieroglyphs", +1, 0, 0, Anatolian_Hieroglyphs_range32, 1 }, { "Arabic", +1, Arabic_range16, 22, Arabic_range32, 35 }, { "Armenian", +1, Armenian_range16, 6, 0, 0 }, { "Avestan", +1, 0, 0, Avestan_range32, 2 }, { "Balinese", +1, Balinese_range16, 2, 0, 0 }, { "Bamum", +1, Bamum_range16, 1, Bamum_range32, 1 }, + { "Bassa_Vah", +1, 0, 0, Bassa_Vah_range32, 2 }, { "Batak", +1, Batak_range16, 2, 0, 0 }, { "Bengali", +1, Bengali_range16, 14, 0, 0 }, + { "Bhaiksuki", +1, 0, 0, Bhaiksuki_range32, 4 }, { "Bopomofo", +1, Bopomofo_range16, 3, 0, 0 }, - { "Brahmi", +1, 0, 0, Brahmi_range32, 2 }, + { "Brahmi", +1, 0, 0, Brahmi_range32, 3 }, { "Braille", +1, Braille_range16, 1, 0, 0 }, { "Buginese", +1, Buginese_range16, 2, 0, 0 }, { "Buhid", +1, Buhid_range16, 1, 0, 0 }, - { "C", +1, C_range16, 15, C_range32, 6 }, + { "C", +1, C_range16, 16, C_range32, 7 }, { "Canadian_Aboriginal", +1, Canadian_Aboriginal_range16, 2, 0, 0 }, { "Carian", +1, 0, 0, Carian_range32, 1 }, + { "Caucasian_Albanian", +1, 0, 0, Caucasian_Albanian_range32, 2 }, { "Cc", +1, Cc_range16, 2, 0, 0 }, - { "Cf", +1, Cf_range16, 12, Cf_range32, 4 }, + { "Cf", +1, Cf_range16, 13, Cf_range32, 5 }, { "Chakma", +1, 0, 0, Chakma_range32, 2 }, { "Cham", +1, Cham_range16, 
4, 0, 0 }, - { "Cherokee", +1, Cherokee_range16, 1, 0, 0 }, + { "Cherokee", +1, Cherokee_range16, 3, 0, 0 }, { "Co", +1, Co_range16, 1, Co_range32, 2 }, - { "Common", +1, Common_range16, 88, Common_range32, 70 }, + { "Common", +1, Common_range16, 92, Common_range32, 72 }, { "Coptic", +1, Coptic_range16, 3, 0, 0 }, { "Cs", +1, Cs_range16, 1, 0, 0 }, - { "Cuneiform", +1, 0, 0, Cuneiform_range32, 3 }, + { "Cuneiform", +1, 0, 0, Cuneiform_range32, 4 }, { "Cypriot", +1, 0, 0, Cypriot_range32, 6 }, - { "Cyrillic", +1, Cyrillic_range16, 7, 0, 0 }, + { "Cyrillic", +1, Cyrillic_range16, 8, 0, 0 }, { "Deseret", +1, 0, 0, Deseret_range32, 1 }, - { "Devanagari", +1, Devanagari_range16, 5, 0, 0 }, + { "Devanagari", +1, Devanagari_range16, 4, 0, 0 }, + { "Duployan", +1, 0, 0, Duployan_range32, 5 }, { "Egyptian_Hieroglyphs", +1, 0, 0, Egyptian_Hieroglyphs_range32, 1 }, + { "Elbasan", +1, 0, 0, Elbasan_range32, 1 }, { "Ethiopic", +1, Ethiopic_range16, 32, 0, 0 }, { "Georgian", +1, Georgian_range16, 8, 0, 0 }, - { "Glagolitic", +1, Glagolitic_range16, 2, 0, 0 }, + { "Glagolitic", +1, Glagolitic_range16, 2, Glagolitic_range32, 5 }, { "Gothic", +1, 0, 0, Gothic_range32, 1 }, - { "Greek", +1, Greek_range16, 31, Greek_range32, 2 }, - { "Gujarati", +1, Gujarati_range16, 13, 0, 0 }, + { "Grantha", +1, 0, 0, Grantha_range32, 15 }, + { "Greek", +1, Greek_range16, 33, Greek_range32, 3 }, + { "Gujarati", +1, Gujarati_range16, 14, 0, 0 }, { "Gurmukhi", +1, Gurmukhi_range16, 16, 0, 0 }, - { "Han", +1, Han_range16, 11, Han_range32, 4 }, + { "Han", +1, Han_range16, 11, Han_range32, 6 }, { "Hangul", +1, Hangul_range16, 14, 0, 0 }, { "Hanunoo", +1, Hanunoo_range16, 1, 0, 0 }, + { "Hatran", +1, 0, 0, Hatran_range32, 3 }, { "Hebrew", +1, Hebrew_range16, 9, 0, 0 }, { "Hiragana", +1, Hiragana_range16, 2, Hiragana_range32, 2 }, { "Imperial_Aramaic", +1, 0, 0, Imperial_Aramaic_range32, 2 }, - { "Inherited", +1, Inherited_range16, 18, Inherited_range32, 6 }, + { "Inherited", +1, Inherited_range16, 20, 
Inherited_range32, 7 }, { "Inscriptional_Pahlavi", +1, 0, 0, Inscriptional_Pahlavi_range32, 2 }, { "Inscriptional_Parthian", +1, 0, 0, Inscriptional_Parthian_range32, 2 }, { "Javanese", +1, Javanese_range16, 3, 0, 0 }, { "Kaithi", +1, 0, 0, Kaithi_range32, 1 }, { "Kannada", +1, Kannada_range16, 14, 0, 0 }, { "Katakana", +1, Katakana_range16, 7, Katakana_range32, 1 }, - { "Kayah_Li", +1, Kayah_Li_range16, 1, 0, 0 }, + { "Kayah_Li", +1, Kayah_Li_range16, 2, 0, 0 }, { "Kharoshthi", +1, 0, 0, Kharoshthi_range32, 8 }, { "Khmer", +1, Khmer_range16, 4, 0, 0 }, - { "L", +1, L_range16, 370, L_range32, 116 }, + { "Khojki", +1, 0, 0, Khojki_range32, 2 }, + { "Khudawadi", +1, 0, 0, Khudawadi_range32, 2 }, + { "L", +1, L_range16, 383, L_range32, 202 }, { "Lao", +1, Lao_range16, 18, 0, 0 }, - { "Latin", +1, Latin_range16, 30, 0, 0 }, + { "Latin", +1, Latin_range16, 31, 0, 0 }, { "Lepcha", +1, Lepcha_range16, 3, 0, 0 }, { "Limbu", +1, Limbu_range16, 5, 0, 0 }, + { "Linear_A", +1, 0, 0, Linear_A_range32, 3 }, { "Linear_B", +1, 0, 0, Linear_B_range32, 7 }, { "Lisu", +1, Lisu_range16, 1, 0, 0 }, - { "Ll", +1, Ll_range16, 582, Ll_range32, 29 }, - { "Lm", +1, Lm_range16, 51, Lm_range32, 1 }, - { "Lo", +1, Lo_range16, 286, Lo_range32, 85 }, + { "Ll", +1, Ll_range16, 600, Ll_range32, 33 }, + { "Lm", +1, Lm_range16, 54, Lm_range32, 3 }, + { "Lo", +1, Lo_range16, 296, Lo_range32, 163 }, { "Lt", +1, Lt_range16, 10, 0, 0 }, - { "Lu", +1, Lu_range16, 576, Lu_range32, 32 }, + { "Lu", +1, Lu_range16, 591, Lu_range32, 36 }, { "Lycian", +1, 0, 0, Lycian_range32, 1 }, { "Lydian", +1, 0, 0, Lydian_range32, 2 }, - { "M", +1, M_range16, 180, M_range32, 24 }, - { "Malayalam", +1, Malayalam_range16, 11, 0, 0 }, + { "M", +1, M_range16, 184, M_range32, 79 }, + { "Mahajani", +1, 0, 0, Mahajani_range32, 1 }, + { "Malayalam", +1, Malayalam_range16, 8, 0, 0 }, { "Mandaic", +1, Mandaic_range16, 2, 0, 0 }, - { "Mc", +1, Mc_range16, 111, Mc_range32, 15 }, - { "Me", +1, Me_range16, 4, 0, 0 }, + { "Manichaean", 
+1, 0, 0, Manichaean_range32, 2 }, + { "Marchen", +1, 0, 0, Marchen_range32, 3 }, + { "Masaram_Gondi", +1, 0, 0, Masaram_Gondi_range32, 7 }, + { "Mc", +1, Mc_range16, 110, Mc_range32, 50 }, + { "Me", +1, Me_range16, 5, 0, 0 }, { "Meetei_Mayek", +1, Meetei_Mayek_range16, 3, 0, 0 }, - { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 2 }, + { "Mende_Kikakui", +1, 0, 0, Mende_Kikakui_range32, 2 }, + { "Meroitic_Cursive", +1, 0, 0, Meroitic_Cursive_range32, 3 }, { "Meroitic_Hieroglyphs", +1, 0, 0, Meroitic_Hieroglyphs_range32, 1 }, { "Miao", +1, 0, 0, Miao_range32, 3 }, - { "Mn", +1, Mn_range16, 194, Mn_range32, 27 }, - { "Mongolian", +1, Mongolian_range16, 6, 0, 0 }, - { "Myanmar", +1, Myanmar_range16, 2, 0, 0 }, - { "N", +1, N_range16, 64, N_range32, 24 }, - { "Nd", +1, Nd_range16, 35, Nd_range32, 7 }, + { "Mn", +1, Mn_range16, 204, Mn_range32, 97 }, + { "Modi", +1, 0, 0, Modi_range32, 2 }, + { "Mongolian", +1, Mongolian_range16, 6, Mongolian_range32, 1 }, + { "Mro", +1, 0, 0, Mro_range32, 3 }, + { "Multani", +1, 0, 0, Multani_range32, 5 }, + { "Myanmar", +1, Myanmar_range16, 3, 0, 0 }, + { "N", +1, N_range16, 67, N_range32, 49 }, + { "Nabataean", +1, 0, 0, Nabataean_range32, 2 }, + { "Nd", +1, Nd_range16, 37, Nd_range32, 18 }, { "New_Tai_Lue", +1, New_Tai_Lue_range16, 4, 0, 0 }, + { "Newa", +1, 0, 0, Newa_range32, 3 }, { "Nko", +1, Nko_range16, 1, 0, 0 }, { "Nl", +1, Nl_range16, 7, Nl_range32, 5 }, - { "No", +1, No_range16, 28, No_range32, 14 }, + { "No", +1, No_range16, 29, No_range32, 31 }, + { "Nushu", +1, 0, 0, Nushu_range32, 2 }, { "Ogham", +1, Ogham_range16, 1, 0, 0 }, { "Ol_Chiki", +1, Ol_Chiki_range16, 1, 0, 0 }, + { "Old_Hungarian", +1, 0, 0, Old_Hungarian_range32, 3 }, { "Old_Italic", +1, 0, 0, Old_Italic_range32, 2 }, + { "Old_North_Arabian", +1, 0, 0, Old_North_Arabian_range32, 1 }, + { "Old_Permic", +1, 0, 0, Old_Permic_range32, 1 }, { "Old_Persian", +1, 0, 0, Old_Persian_range32, 2 }, { "Old_South_Arabian", +1, 0, 0, Old_South_Arabian_range32, 
1 }, { "Old_Turkic", +1, 0, 0, Old_Turkic_range32, 1 }, { "Oriya", +1, Oriya_range16, 14, 0, 0 }, + { "Osage", +1, 0, 0, Osage_range32, 2 }, { "Osmanya", +1, 0, 0, Osmanya_range32, 2 }, - { "P", +1, P_range16, 126, P_range32, 15 }, + { "P", +1, P_range16, 128, P_range32, 44 }, + { "Pahawh_Hmong", +1, 0, 0, Pahawh_Hmong_range32, 5 }, + { "Palmyrene", +1, 0, 0, Palmyrene_range32, 1 }, + { "Pau_Cin_Hau", +1, 0, 0, Pau_Cin_Hau_range32, 1 }, { "Pc", +1, Pc_range16, 6, 0, 0 }, - { "Pd", +1, Pd_range16, 16, 0, 0 }, + { "Pd", +1, Pd_range16, 17, 0, 0 }, { "Pe", +1, Pe_range16, 72, 0, 0 }, { "Pf", +1, Pf_range16, 10, 0, 0 }, { "Phags_Pa", +1, Phags_Pa_range16, 1, 0, 0 }, { "Phoenician", +1, 0, 0, Phoenician_range32, 2 }, { "Pi", +1, Pi_range16, 11, 0, 0 }, - { "Po", +1, Po_range16, 120, Po_range32, 15 }, - { "Ps", +1, Ps_range16, 74, 0, 0 }, + { "Po", +1, Po_range16, 125, Po_range32, 44 }, + { "Ps", +1, Ps_range16, 75, 0, 0 }, + { "Psalter_Pahlavi", +1, 0, 0, Psalter_Pahlavi_range32, 3 }, { "Rejang", +1, Rejang_range16, 2, 0, 0 }, { "Runic", +1, Runic_range16, 2, 0, 0 }, - { "S", +1, S_range16, 143, S_range32, 56 }, + { "S", +1, S_range16, 148, S_range32, 69 }, { "Samaritan", +1, Samaritan_range16, 2, 0, 0 }, { "Saurashtra", +1, Saurashtra_range16, 2, 0, 0 }, { "Sc", +1, Sc_range16, 17, 0, 0 }, { "Sharada", +1, 0, 0, Sharada_range32, 2 }, { "Shavian", +1, 0, 0, Shavian_range32, 1 }, - { "Sinhala", +1, Sinhala_range16, 11, 0, 0 }, - { "Sk", +1, Sk_range16, 27, 0, 0 }, + { "Siddham", +1, 0, 0, Siddham_range32, 2 }, + { "SignWriting", +1, 0, 0, SignWriting_range32, 3 }, + { "Sinhala", +1, Sinhala_range16, 12, Sinhala_range32, 1 }, + { "Sk", +1, Sk_range16, 28, Sk_range32, 1 }, { "Sm", +1, Sm_range16, 53, Sm_range32, 11 }, - { "So", +1, So_range16, 108, So_range32, 45 }, + { "So", +1, So_range16, 114, So_range32, 59 }, { "Sora_Sompeng", +1, 0, 0, Sora_Sompeng_range32, 2 }, + { "Soyombo", +1, 0, 0, Soyombo_range32, 3 }, { "Sundanese", +1, Sundanese_range16, 2, 0, 0 }, { 
"Syloti_Nagri", +1, Syloti_Nagri_range16, 1, 0, 0 }, - { "Syriac", +1, Syriac_range16, 3, 0, 0 }, + { "Syriac", +1, Syriac_range16, 4, 0, 0 }, { "Tagalog", +1, Tagalog_range16, 2, 0, 0 }, { "Tagbanwa", +1, Tagbanwa_range16, 3, 0, 0 }, { "Tai_Le", +1, Tai_Le_range16, 2, 0, 0 }, @@ -5057,20 +5894,24 @@ const UGroup unicode_groups[] = { { "Tai_Viet", +1, Tai_Viet_range16, 2, 0, 0 }, { "Takri", +1, 0, 0, Takri_range32, 2 }, { "Tamil", +1, Tamil_range16, 16, 0, 0 }, - { "Telugu", +1, Telugu_range16, 14, 0, 0 }, + { "Tangut", +1, 0, 0, Tangut_range32, 3 }, + { "Telugu", +1, Telugu_range16, 13, 0, 0 }, { "Thaana", +1, Thaana_range16, 1, 0, 0 }, { "Thai", +1, Thai_range16, 2, 0, 0 }, { "Tibetan", +1, Tibetan_range16, 7, 0, 0 }, { "Tifinagh", +1, Tifinagh_range16, 3, 0, 0 }, + { "Tirhuta", +1, 0, 0, Tirhuta_range32, 2 }, { "Ugaritic", +1, 0, 0, Ugaritic_range32, 2 }, { "Vai", +1, Vai_range16, 1, 0, 0 }, + { "Warang_Citi", +1, 0, 0, Warang_Citi_range32, 2 }, { "Yi", +1, Yi_range16, 2, 0, 0 }, { "Z", +1, Z_range16, 8, 0, 0 }, + { "Zanabazar_Square", +1, 0, 0, Zanabazar_Square_range32, 1 }, { "Zl", +1, Zl_range16, 1, 0, 0 }, { "Zp", +1, Zp_range16, 1, 0, 0 }, { "Zs", +1, Zs_range16, 7, 0, 0 }, }; -const int num_unicode_groups = 138; +const int num_unicode_groups = 177; } // namespace re2 diff --git a/contrib/libre2/re2/unicode_groups.h b/contrib/libre2/re2/unicode_groups.h index 7f56331158b..75f55daa619 100644 --- a/contrib/libre2/re2/unicode_groups.h +++ b/contrib/libre2/re2/unicode_groups.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_UNICODE_GROUPS_H_ +#define RE2_UNICODE_GROUPS_H_ + // Unicode character groups. // The codes get split into ranges of 16-bit codes @@ -15,23 +18,23 @@ // to 16.5 kB of data but make the data harder to use; // we don't bother. 
-#ifndef RE2_UNICODE_GROUPS_H__ -#define RE2_UNICODE_GROUPS_H__ +#include #include "util/util.h" +#include "util/utf.h" namespace re2 { struct URange16 { - uint16 lo; - uint16 hi; + uint16_t lo; + uint16_t hi; }; struct URange32 { - uint32 lo; - uint32 hi; + Rune lo; + Rune hi; }; struct UGroup @@ -61,4 +64,4 @@ extern const int num_perl_groups; } // namespace re2 -#endif // RE2_UNICODE_GROUPS_H__ +#endif // RE2_UNICODE_GROUPS_H_ diff --git a/contrib/libre2/re2/walker-inl.h b/contrib/libre2/re2/walker-inl.h index 4d2045f7249..032b8ac7db9 100644 --- a/contrib/libre2/re2/walker-inl.h +++ b/contrib/libre2/re2/walker-inl.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef RE2_WALKER_INL_H_ +#define RE2_WALKER_INL_H_ + // Helper class for traversing Regexps without recursion. // Clients should declare their own subclasses that override // the PreVisit and PostVisit methods, which are called before @@ -10,9 +13,9 @@ // Not quite the Visitor pattern, because (among other things) // the Visitor pattern is recursive. -#ifndef RE2_WALKER_INL_H__ -#define RE2_WALKER_INL_H__ +#include +#include "util/logging.h" #include "re2/regexp.h" namespace re2 { @@ -86,13 +89,14 @@ template class Regexp::Walker { private: // Walk state for the entire traversal. - stack >* stack_; + std::stack >* stack_; bool stopped_early_; int max_visits_; T WalkInternal(Regexp* re, T top_arg, bool use_copy); - DISALLOW_EVIL_CONSTRUCTORS(Walker); + Walker(const Walker&) = delete; + Walker& operator=(const Walker&) = delete; }; template T Regexp::Walker::PreVisit(Regexp* re, @@ -130,7 +134,7 @@ template struct WalkState { }; template Regexp::Walker::Walker() { - stack_ = new stack >; + stack_ = new std::stack >; stopped_early_ = false; } @@ -187,7 +191,7 @@ template T Regexp::Walker::WalkInternal(Regexp* re, T top_arg, s->child_args = &s->child_arg; else if (re->nsub_ > 1) s->child_args = new T[re->nsub_]; - // Fall through. 
+ FALLTHROUGH_INTENDED; } default: { if (re->nsub_ > 0) { @@ -241,4 +245,4 @@ template T Regexp::Walker::WalkExponential(Regexp* re, T top_arg, } // namespace re2 -#endif // RE2_WALKER_INL_H__ +#endif // RE2_WALKER_INL_H_ diff --git a/contrib/libre2/re2_transform.cmake b/contrib/libre2/re2_transform.cmake index 7eb9ffd9ef3..bf23095c2c2 100644 --- a/contrib/libre2/re2_transform.cmake +++ b/contrib/libre2/re2_transform.cmake @@ -1,5 +1,6 @@ file (READ ${SOURCE_FILENAME} CONTENT) string (REGEX REPLACE "using re2::RE2;" "" CONTENT "${CONTENT}") +string (REGEX REPLACE "using re2::LazyRE2;" "" CONTENT "${CONTENT}") string (REGEX REPLACE "namespace re2" "namespace re2_st" CONTENT "${CONTENT}") string (REGEX REPLACE "re2::" "re2_st::" CONTENT "${CONTENT}") string (REGEX REPLACE "\"re2/" "\"re2_st/" CONTENT "${CONTENT}") diff --git a/contrib/libre2/util/arena.cc b/contrib/libre2/util/arena.cc deleted file mode 100644 index 510592f06ab..00000000000 --- a/contrib/libre2/util/arena.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2000 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#include "util/util.h" - -namespace re2 { - -// ---------------------------------------------------------------------- -// UnsafeArena::UnsafeArena() -// UnsafeArena::~UnsafeArena() -// Destroying the arena automatically calls Reset() -// ---------------------------------------------------------------------- - - -UnsafeArena::UnsafeArena(const size_t block_size) - : block_size_(block_size), - freestart_(NULL), // set for real in Reset() - last_alloc_(NULL), - remaining_(0), - blocks_alloced_(1), - overflow_blocks_(NULL) { - assert(block_size > kDefaultAlignment); - - first_blocks_[0].mem = reinterpret_cast(malloc(block_size_)); - first_blocks_[0].size = block_size_; - - Reset(); -} - -UnsafeArena::~UnsafeArena() { - FreeBlocks(); - assert(overflow_blocks_ == NULL); // FreeBlocks() should do that - // The first X blocks stay allocated always by default. Delete them now. - for (int i = 0; i < blocks_alloced_; i++) - free(first_blocks_[i].mem); -} - -// ---------------------------------------------------------------------- -// UnsafeArena::Reset() -// Clears all the memory an arena is using. -// ---------------------------------------------------------------------- - -void UnsafeArena::Reset() { - FreeBlocks(); - freestart_ = first_blocks_[0].mem; - remaining_ = first_blocks_[0].size; - last_alloc_ = NULL; - - // We do not know for sure whether or not the first block is aligned, - // so we fix that right now. - const int overage = reinterpret_cast(freestart_) & - (kDefaultAlignment-1); - if (overage > 0) { - const int waste = kDefaultAlignment - overage; - freestart_ += waste; - remaining_ -= waste; - } - freestart_when_empty_ = freestart_; - assert(!(reinterpret_cast(freestart_)&(kDefaultAlignment-1))); -} - -// ------------------------------------------------------------- -// UnsafeArena::AllocNewBlock() -// Adds and returns an AllocatedBlock. -// The returned AllocatedBlock* is valid until the next call -// to AllocNewBlock or Reset. (i.e. 
anything that might -// affect overflow_blocks_). -// ------------------------------------------------------------- - -UnsafeArena::AllocatedBlock* UnsafeArena::AllocNewBlock(const size_t block_size) { - AllocatedBlock *block; - // Find the next block. - if (static_cast(blocks_alloced_) < arraysize(first_blocks_) ) { - // Use one of the pre-allocated blocks - block = &first_blocks_[blocks_alloced_++]; - } else { // oops, out of space, move to the vector - if (overflow_blocks_ == NULL) overflow_blocks_ = new vector; - // Adds another block to the vector. - overflow_blocks_->resize(overflow_blocks_->size()+1); - // block points to the last block of the vector. - block = &overflow_blocks_->back(); - } - - block->mem = reinterpret_cast(malloc(block_size)); - block->size = block_size; - - return block; -} - -// ---------------------------------------------------------------------- -// UnsafeArena::GetMemoryFallback() -// We take memory out of our pool, aligned on the byte boundary -// requested. If we don't have space in our current pool, we -// allocate a new block (wasting the remaining space in the -// current block) and give you that. If your memory needs are -// too big for a single block, we make a special your-memory-only -// allocation -- this is equivalent to not using the arena at all. 
-// ---------------------------------------------------------------------- - -void* UnsafeArena::GetMemoryFallback(const size_t size, const int align) { - if (size == 0) - return NULL; // stl/stl_alloc.h says this is okay - - assert(align > 0 && 0 == (align & (align - 1))); // must be power of 2 - - // If the object is more than a quarter of the block size, allocate - // it separately to avoid wasting too much space in leftover bytes - if (block_size_ == 0 || size > block_size_/4) { - // then it gets its own block in the arena - assert(align <= kDefaultAlignment); // because that's what new gives us - // This block stays separate from the rest of the world; in particular - // we don't update last_alloc_ so you can't reclaim space on this block. - return AllocNewBlock(size)->mem; - } - - const int overage = - (reinterpret_cast(freestart_) & (align-1)); - if (overage) { - const int waste = align - overage; - freestart_ += waste; - if (waste < static_cast(remaining_)) { - remaining_ -= waste; - } else { - remaining_ = 0; - } - } - if (size > remaining_) { - AllocatedBlock *block = AllocNewBlock(block_size_); - freestart_ = block->mem; - remaining_ = block->size; - } - remaining_ -= size; - last_alloc_ = freestart_; - freestart_ += size; - assert((reinterpret_cast(last_alloc_) & (align-1)) == 0); - return reinterpret_cast(last_alloc_); -} - -// ---------------------------------------------------------------------- -// UnsafeArena::FreeBlocks() -// Unlike GetMemory(), which does actual work, ReturnMemory() is a -// no-op: we don't "free" memory until Reset() is called. We do -// update some stats, though. Note we do no checking that the -// pointer you pass in was actually allocated by us, or that it -// was allocated for the size you say, so be careful here! -// FreeBlocks() does the work for Reset(), actually freeing all -// memory allocated in one fell swoop. 
-// ---------------------------------------------------------------------- - -void UnsafeArena::FreeBlocks() { - for ( int i = 1; i < blocks_alloced_; ++i ) { // keep first block alloced - free(first_blocks_[i].mem); - first_blocks_[i].mem = NULL; - first_blocks_[i].size = 0; - } - blocks_alloced_ = 1; - if (overflow_blocks_ != NULL) { - vector::iterator it; - for (it = overflow_blocks_->begin(); it != overflow_blocks_->end(); ++it) { - free(it->mem); - } - delete overflow_blocks_; // These should be used very rarely - overflow_blocks_ = NULL; - } -} - -} // namespace re2 diff --git a/contrib/libre2/util/arena.h b/contrib/libre2/util/arena.h deleted file mode 100644 index 7eb385b00e3..00000000000 --- a/contrib/libre2/util/arena.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2000 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Sometimes it is necessary to allocate a large number of small -// objects. Doing this the usual way (malloc, new) is slow, -// especially for multithreaded programs. An UnsafeArena provides a -// mark/release method of memory management: it asks for a large chunk -// from the operating system and doles it out bit by bit as required. -// Then you free all the memory at once by calling UnsafeArena::Reset(). -// The "Unsafe" refers to the fact that UnsafeArena is not safe to -// call from multiple threads. -// -// The global operator new that can be used as follows: -// -// #include "lib/arena-inl.h" -// -// UnsafeArena arena(1000); -// Foo* foo = new (AllocateInArena, &arena) Foo; -// - -#ifndef RE2_UTIL_ARENA_H_ -#define RE2_UTIL_ARENA_H_ - -namespace re2 { - -// This class is thread-compatible. -class UnsafeArena { - public: - UnsafeArena(const size_t block_size); - virtual ~UnsafeArena(); - - void Reset(); - - // This should be the worst-case alignment for any type. 
This is - // good for IA-32, SPARC version 7 (the last one I know), and - // supposedly Alpha. i386 would be more time-efficient with a - // default alignment of 8, but ::operator new() uses alignment of 4, - // and an assertion will fail below after the call to MakeNewBlock() - // if you try to use a larger alignment. -#ifdef __i386__ - static const int kDefaultAlignment = 4; -#else - static const int kDefaultAlignment = 8; -#endif - - private: - void* GetMemoryFallback(const size_t size, const int align); - - public: - void* GetMemory(const size_t size, const int align) { - if ( size > 0 && size < remaining_ && align == 1 ) { // common case - last_alloc_ = freestart_; - freestart_ += size; - remaining_ -= size; - return reinterpret_cast(last_alloc_); - } - return GetMemoryFallback(size, align); - } - - private: - struct AllocatedBlock { - char *mem; - size_t size; - }; - - // The returned AllocatedBlock* is valid until the next call to AllocNewBlock - // or Reset (i.e. anything that might affect overflow_blocks_). - AllocatedBlock *AllocNewBlock(const size_t block_size); - - const AllocatedBlock *IndexToBlock(int index) const; - - const size_t block_size_; - char* freestart_; // beginning of the free space in most recent block - char* freestart_when_empty_; // beginning of the free space when we're empty - char* last_alloc_; // used to make sure ReturnBytes() is safe - size_t remaining_; - // STL vector isn't as efficient as it could be, so we use an array at first - int blocks_alloced_; // how many of the first_blocks_ have been alloced - AllocatedBlock first_blocks_[16]; // the length of this array is arbitrary - // if the first_blocks_ aren't enough, expand into overflow_blocks_. - vector* overflow_blocks_; - - void FreeBlocks(); // Frees all except first block - - DISALLOW_EVIL_CONSTRUCTORS(UnsafeArena); -}; - -// Operators for allocation on the arena -// Syntax: new (AllocateInArena, arena) MyClass; -// STL containers, etc. 
-enum AllocateInArenaType { AllocateInArena }; - -} // namespace re2 - -inline void* operator new(size_t size, - re2::AllocateInArenaType /* unused */, - re2::UnsafeArena *arena) { - return reinterpret_cast(arena->GetMemory(size, 1)); -} - -#endif // RE2_UTIL_ARENA_H_ - diff --git a/contrib/libre2/util/atomicops.h b/contrib/libre2/util/atomicops.h deleted file mode 100644 index dd951f2610b..00000000000 --- a/contrib/libre2/util/atomicops.h +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef RE2_UTIL_ATOMICOPS_H__ -#define RE2_UTIL_ATOMICOPS_H__ - -// The memory ordering constraints resemble the ones in C11. -// RELAXED - no memory ordering, just an atomic operation. -// CONSUME - data-dependent ordering. -// ACQUIRE - prevents memory accesses from hoisting above the operation. -// RELEASE - prevents memory accesses from sinking below the operation. 
- -#if (__clang_major__ * 100 + __clang_minor__ >= 303) || \ - (__GNUC__ * 1000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__ >= 40801) - -#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_RELAXED); } while (0) -#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_CONSUME); } while (0) -#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = __atomic_load_n((p), __ATOMIC_ACQUIRE); } while (0) -#define ATOMIC_STORE_RELAXED(p, v) __atomic_store_n((p), (v), __ATOMIC_RELAXED) -#define ATOMIC_STORE_RELEASE(p, v) __atomic_store_n((p), (v), __ATOMIC_RELEASE) - -#else // old compiler - -#define ATOMIC_LOAD_RELAXED(x, p) do { (x) = *(p); } while (0) -#define ATOMIC_LOAD_CONSUME(x, p) do { (x) = *(p); MaybeReadMemoryBarrier(); } while (0) -#define ATOMIC_LOAD_ACQUIRE(x, p) do { (x) = *(p); ReadMemoryBarrier(); } while (0) -#define ATOMIC_STORE_RELAXED(p, v) do { *(p) = (v); } while (0) -#define ATOMIC_STORE_RELEASE(p, v) do { WriteMemoryBarrier(); *(p) = (v); } while (0) - -// WriteMemoryBarrier(), ReadMemoryBarrier() and MaybeReadMemoryBarrier() -// are an implementation detail and must not be used in the rest of the code. - -#if defined(__i386__) - -static inline void WriteMemoryBarrier() { - int x; - __asm__ __volatile__("xchgl (%0),%0" // The lock prefix is implicit for xchg. - :: "r" (&x)); -} - -#elif defined(__x86_64__) - -// 64-bit implementations of memory barrier can be simpler, because -// "sfence" is guaranteed to exist. 
-static inline void WriteMemoryBarrier() { - __asm__ __volatile__("sfence" : : : "memory"); -} - -#elif defined(__ppc__) - -static inline void WriteMemoryBarrier() { - __asm__ __volatile__("eieio" : : : "memory"); -} - -#elif defined(__alpha__) - -static inline void WriteMemoryBarrier() { - __asm__ __volatile__("wmb" : : : "memory"); -} - -#elif defined(__aarch64__) - -static inline void WriteMemoryBarrier() { - __asm__ __volatile__("dmb st" : : : "memory"); -} - -#else - -#include "util/mutex.h" - -static inline void WriteMemoryBarrier() { - // Slight overkill, but good enough: - // any mutex implementation must have - // a read barrier after the lock operation and - // a write barrier before the unlock operation. - // - // It may be worthwhile to write architecture-specific - // barriers for the common platforms, as above, but - // this is a correct fallback. - re2::Mutex mu; - re2::MutexLock l(&mu); -} - -/* -#error Need WriteMemoryBarrier for architecture. - -// Windows -inline void WriteMemoryBarrier() { - LONG x; - ::InterlockedExchange(&x, 0); -} -*/ - -#endif - -// Alpha has very weak memory ordering. If relying on WriteBarriers, one must -// use read barriers for the readers too. -#if defined(__alpha__) - -static inline void MaybeReadMemoryBarrier() { - __asm__ __volatile__("mb" : : : "memory"); -} - -#else - -static inline void MaybeReadMemoryBarrier() {} - -#endif // __alpha__ - -// Read barrier for various targets. 
- -#if defined(__aarch64__) - -static inline void ReadMemoryBarrier() { - __asm__ __volatile__("dmb ld" : : : "memory"); -} - -#elif defined(__alpha__) - -static inline void ReadMemoryBarrier() { - __asm__ __volatile__("mb" : : : "memory"); -} - -#else - -static inline void ReadMemoryBarrier() {} - -#endif - -#endif // old compiler - -#ifndef NO_THREAD_SAFETY_ANALYSIS -#define NO_THREAD_SAFETY_ANALYSIS -#endif - -#endif // RE2_UTIL_ATOMICOPS_H__ diff --git a/contrib/libre2/util/benchmark.cc b/contrib/libre2/util/benchmark.cc index c3aad7ed89a..125bbe393fc 100644 --- a/contrib/libre2/util/benchmark.cc +++ b/contrib/libre2/util/benchmark.cc @@ -2,6 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#include +#include +#include +#include +#include + #include "util/util.h" #include "util/flags.h" #include "util/benchmark.h" @@ -9,8 +15,11 @@ DEFINE_string(test_tmpdir, "/var/tmp", "temp directory"); +#ifdef _WIN32 +#define snprintf _snprintf +#endif + using testing::Benchmark; -using namespace re2; static Benchmark* benchmarks[10000]; static int nbenchmarks; @@ -24,19 +33,17 @@ void Benchmark::Register() { nbenchmarks++; } -static int64 nsec() { - struct timeval tv; - if(gettimeofday(&tv, 0) < 0) - return -1; - return (int64)tv.tv_sec*1000*1000*1000 + tv.tv_usec*1000; +static int64_t nsec() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count(); } -static int64 bytes; -static int64 ns; -static int64 t0; -static int64 items; +static int64_t bytes; +static int64_t ns; +static int64_t t0; +static int64_t items; -void SetBenchmarkBytesProcessed(long long x) { +void SetBenchmarkBytesProcessed(int64_t x) { bytes = x; } @@ -74,7 +81,7 @@ static void runN(Benchmark *b, int n, int siz) { b->fnr(n, siz); else { fprintf(stderr, "%s: missing function\n", b->name); - exit(2); + abort(); } if(t0 != 0) ns += nsec() - t0; @@ -105,11 +112,11 @@ void RunBench(Benchmark* b, int 
nthread, int siz) { while(ns < (int)1e9 && n < (int)1e9) { last = n; if(ns/n == 0) - n = 1e9; + n = (int)1e9; else - n = 1e9 / (ns/n); + n = (int)1e9 / static_cast(ns/n); - n = max(last+1, min(n+n/2, 100*last)); + n = std::max(last+1, std::min(n+n/2, 100*last)); n = round(n); runN(b, n, siz); } @@ -146,7 +153,7 @@ int main(int argc, const char** argv) { Benchmark* b = benchmarks[i]; if(match(b->name, argc, argv)) for(int j = b->threadlo; j <= b->threadhi; j++) - for(int k = max(b->lo, 1); k <= max(b->hi, 1); k<<=1) + for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1) RunBench(b, j, k); } } diff --git a/contrib/libre2/util/benchmark.h b/contrib/libre2/util/benchmark.h index 31bbd5348ae..fba30b9cba7 100644 --- a/contrib/libre2/util/benchmark.h +++ b/contrib/libre2/util/benchmark.h @@ -2,8 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_BENCHMARK_H__ -#define RE2_UTIL_BENCHMARK_H__ +#ifndef UTIL_BENCHMARK_H_ +#define UTIL_BENCHMARK_H_ + +#include namespace testing { struct Benchmark { @@ -14,7 +16,7 @@ struct Benchmark { int hi; int threadlo; int threadhi; - + void Register(); Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); } Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); } @@ -23,7 +25,7 @@ struct Benchmark { }; } // namespace testing -void SetBenchmarkBytesProcessed(long long); +void SetBenchmarkBytesProcessed(int64_t); void StopBenchmarkTiming(); void StartBenchmarkTiming(); void BenchmarkMemoryUsage(); @@ -38,4 +40,4 @@ int NumCPUs(); ::testing::Benchmark* _benchmark_##f = \ (new ::testing::Benchmark(#f, f, lo, hi)) -#endif // RE2_UTIL_BENCHMARK_H__ +#endif // UTIL_BENCHMARK_H_ diff --git a/contrib/libre2/util/flags.h b/contrib/libre2/util/flags.h index 77a06a222e3..5af1320f59e 100644 --- a/contrib/libre2/util/flags.h +++ b/contrib/libre2/util/flags.h @@ -2,13 +2,15 @@ // 
Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_FLAGS_H_ +#define UTIL_FLAGS_H_ + // Simplified version of Google's command line flags. // Does not support parsing the command line. // If you want to do that, see -// http://code.google.com/p/google-gflags +// https://gflags.github.io/gflags/ -#ifndef RE2_UTIL_FLAGS_H__ -#define RE2_UTIL_FLAGS_H__ +#include #define DEFINE_flag(type, name, deflt, desc) \ namespace re2 { type FLAGS_##name = deflt; } @@ -17,11 +19,11 @@ namespace re2 { extern type FLAGS_##name; } #define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc) -#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32, name, deflt, desc) +#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc) #define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc) #define DECLARE_bool(name) DECLARE_flag(bool, name) -#define DECLARE_int32(name) DECLARE_flag(int32, name) +#define DECLARE_int32(name) DECLARE_flag(int32_t, name) #define DECLARE_string(name) DECLARE_flag(string, name) -#endif // RE2_UTIL_FLAGS_H__ +#endif // UTIL_FLAGS_H_ diff --git a/contrib/libre2/util/fuzz.cc b/contrib/libre2/util/fuzz.cc new file mode 100644 index 00000000000..9cac1185ac6 --- /dev/null +++ b/contrib/libre2/util/fuzz.cc @@ -0,0 +1,21 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include +#include +#include + +// Entry point for libFuzzer. 
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size); + +int main(int argc, char** argv) { + uint8_t data[32]; + for (int i = 0; i < 32; i++) { + for (int j = 0; j < 32; j++) { + data[j] = random() & 0xFF; + } + LLVMFuzzerTestOneInput(data, 32); + } + return 0; +} diff --git a/contrib/libre2/util/hash.cc b/contrib/libre2/util/hash.cc deleted file mode 100644 index dfef7b7c364..00000000000 --- a/contrib/libre2/util/hash.cc +++ /dev/null @@ -1,231 +0,0 @@ -// Modified by Russ Cox to add "namespace re2". -// Also threw away all but hashword and hashword2. -// http://burtleburtle.net/bob/c/lookup3.c - -/* -------------------------------------------------------------------------------- -lookup3.c, by Bob Jenkins, May 2006, Public Domain. - -These are functions for producing 32-bit hashes for hash table lookup. -hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() -are externally useful functions. Routines to test the hash are included -if SELF_TEST is defined. You can use this free for any purpose. It's in -the public domain. It has no warranty. - -You probably want to use hashlittle(). hashlittle() and hashbig() -hash byte arrays. hashlittle() is is faster than hashbig() on -little-endian machines. Intel and AMD are little-endian machines. -On second thought, you probably want hashlittle2(), which is identical to -hashlittle() except it returns two 32-bit hashes for the price of one. -You could implement hashbig2() if you wanted but I haven't bothered here. - -If you want to find a hash of, say, exactly 7 integers, do - a = i1; b = i2; c = i3; - mix(a,b,c); - a += i4; b += i5; c += i6; - mix(a,b,c); - a += i7; - final(a,b,c); -then use c as the hash value. If you have a variable length array of -4-byte integers to hash, use hashword(). If you have a byte array (like -a character string), use hashlittle(). If you have several byte arrays, or -a mix of things, see the comments above hashlittle(). - -Why is this so big? 
I read 12 bytes at a time into 3 4-byte integers, -then mix those integers. This is fast (you can do a lot more thorough -mixing with 12*3 instructions on 3 integers than you can with 3 instructions -on 1 byte), but shoehorning those bytes into integers efficiently is messy. -------------------------------------------------------------------------------- -*/ - -#include "util/util.h" - -#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k)))) - -/* -------------------------------------------------------------------------------- -mix -- mix 3 32-bit values reversibly. - -This is reversible, so any information in (a,b,c) before mix() is -still in (a,b,c) after mix(). - -If four pairs of (a,b,c) inputs are run through mix(), or through -mix() in reverse, there are at least 32 bits of the output that -are sometimes the same for one pair and different for another pair. -This was tested for: -* pairs that differed by one bit, by two bits, in any combination - of top bits of (a,b,c), or in any combination of bottom bits of - (a,b,c). -* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed - the output delta to a Gray code (a^(a>>1)) so a string of 1's (as - is commonly produced by subtraction) look like a single 1-bit - difference. -* the base values were pseudorandom, all zero but one bit set, or - all zero plus a counter that starts at zero. - -Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that -satisfy this are - 4 6 8 16 19 4 - 9 15 3 18 27 15 - 14 9 3 7 17 3 -Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing -for "differ" defined as + with a one-bit base and a two-bit delta. I -used http://burtleburtle.net/bob/hash/avalanche.html to choose -the operations, constants, and arrangements of the variables. - -This does not achieve avalanche. There are input bits of (a,b,c) -that fail to affect some output bits of (a,b,c), especially of a. The -most thoroughly mixed value is c, but it doesn't really even achieve -avalanche in c. 
- -This allows some parallelism. Read-after-writes are good at doubling -the number of bits affected, so the goal of mixing pulls in the opposite -direction as the goal of parallelism. I did what I could. Rotates -seem to cost as much as shifts on every machine I could lay my hands -on, and rotates are much kinder to the top and bottom bits, so I used -rotates. -------------------------------------------------------------------------------- -*/ -#define mix(a,b,c) \ -{ \ - a -= c; a ^= rot(c, 4); c += b; \ - b -= a; b ^= rot(a, 6); a += c; \ - c -= b; c ^= rot(b, 8); b += a; \ - a -= c; a ^= rot(c,16); c += b; \ - b -= a; b ^= rot(a,19); a += c; \ - c -= b; c ^= rot(b, 4); b += a; \ -} - -/* -------------------------------------------------------------------------------- -final -- final mixing of 3 32-bit values (a,b,c) into c - -Pairs of (a,b,c) values differing in only a few bits will usually -produce values of c that look totally different. This was tested for -* pairs that differed by one bit, by two bits, in any combination - of top bits of (a,b,c), or in any combination of bottom bits of - (a,b,c). -* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed - the output delta to a Gray code (a^(a>>1)) so a string of 1's (as - is commonly produced by subtraction) look like a single 1-bit - difference. -* the base values were pseudorandom, all zero but one bit set, or - all zero plus a counter that starts at zero. 
- -These constants passed: - 14 11 25 16 4 14 24 - 12 14 25 16 4 14 24 -and these came close: - 4 8 15 26 3 22 24 - 10 8 15 26 3 22 24 - 11 8 15 26 3 22 24 -------------------------------------------------------------------------------- -*/ -#define final(a,b,c) \ -{ \ - c ^= b; c -= rot(b,14); \ - a ^= c; a -= rot(c,11); \ - b ^= a; b -= rot(a,25); \ - c ^= b; c -= rot(b,16); \ - a ^= c; a -= rot(c,4); \ - b ^= a; b -= rot(a,14); \ - c ^= b; c -= rot(b,24); \ -} - -namespace re2 { - -/* --------------------------------------------------------------------- - This works on all machines. To be useful, it requires - -- that the key be an array of uint32_t's, and - -- that the length be the number of uint32_t's in the key - - The function hashword() is identical to hashlittle() on little-endian - machines, and identical to hashbig() on big-endian machines, - except that the length has to be measured in uint32_ts rather than in - bytes. hashlittle() is more complicated than hashword() only because - hashlittle() has to dance around fitting the key bytes into registers. 
--------------------------------------------------------------------- -*/ -uint32 hashword( -const uint32 *k, /* the key, an array of uint32_t values */ -size_t length, /* the length of the key, in uint32_ts */ -uint32 initval) /* the previous hash, or an arbitrary value */ -{ - uint32_t a,b,c; - - /* Set up the internal state */ - a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval; - - /*------------------------------------------------- handle most of the key */ - while (length > 3) - { - a += k[0]; - b += k[1]; - c += k[2]; - mix(a,b,c); - length -= 3; - k += 3; - } - - /*------------------------------------------- handle the last 3 uint32_t's */ - switch(length) /* all the case statements fall through */ - { - case 3 : c+=k[2]; - case 2 : b+=k[1]; - case 1 : a+=k[0]; - final(a,b,c); - case 0: /* case 0: nothing left to add */ - break; - } - /*------------------------------------------------------ report the result */ - return c; -} - - -/* --------------------------------------------------------------------- -hashword2() -- same as hashword(), but take two seeds and return two -32-bit values. pc and pb must both be nonnull, and *pc and *pb must -both be initialized with seeds. If you pass in (*pb)==0, the output -(*pc) will be the same as the return value from hashword(). 
--------------------------------------------------------------------- -*/ -void hashword2 ( -const uint32 *k, /* the key, an array of uint32_t values */ -size_t length, /* the length of the key, in uint32_ts */ -uint32 *pc, /* IN: seed OUT: primary hash value */ -uint32 *pb) /* IN: more seed OUT: secondary hash value */ -{ - uint32_t a,b,c; - - /* Set up the internal state */ - a = b = c = 0xdeadbeef + ((uint32_t)(length<<2)) + *pc; - c += *pb; - - /*------------------------------------------------- handle most of the key */ - while (length > 3) - { - a += k[0]; - b += k[1]; - c += k[2]; - mix(a,b,c); - length -= 3; - k += 3; - } - - /*------------------------------------------- handle the last 3 uint32_t's */ - switch(length) /* all the case statements fall through */ - { - case 3 : c+=k[2]; - case 2 : b+=k[1]; - case 1 : a+=k[0]; - final(a,b,c); - case 0: /* case 0: nothing left to add */ - break; - } - /*------------------------------------------------------ report the result */ - *pc=c; *pb=b; -} - -} // namespace re2 diff --git a/contrib/libre2/util/logging.h b/contrib/libre2/util/logging.h index 4443f7cdfe0..e9aa4469c30 100644 --- a/contrib/libre2/util/logging.h +++ b/contrib/libre2/util/logging.h @@ -2,14 +2,19 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_LOGGING_H_ +#define UTIL_LOGGING_H_ + // Simplified version of Google's logging. -#ifndef RE2_UTIL_LOGGING_H__ -#define RE2_UTIL_LOGGING_H__ - -#include /* for write */ +#include +#include +#include +#include #include +#include "util/util.h" + // Debug-only checking. 
#define DCHECK(condition) assert(condition) #define DCHECK_EQ(val1, val2) assert((val1) == (val2)) @@ -29,33 +34,37 @@ #define CHECK_NE(x, y) CHECK((x) != (y)) #define LOG_INFO LogMessage(__FILE__, __LINE__) -#define LOG_ERROR LOG_INFO -#define LOG_WARNING LOG_INFO +#define LOG_WARNING LogMessage(__FILE__, __LINE__) +#define LOG_ERROR LogMessage(__FILE__, __LINE__) #define LOG_FATAL LogMessageFatal(__FILE__, __LINE__) #define LOG_QFATAL LOG_FATAL -#define VLOG(x) if((x)>0){}else LOG_INFO.stream() +// It seems that one of the Windows header files defines ERROR as 0. +#ifdef _WIN32 +#define LOG_0 LOG_INFO +#endif #ifdef NDEBUG -#define DEBUG_MODE 0 #define LOG_DFATAL LOG_ERROR #else -#define DEBUG_MODE 1 #define LOG_DFATAL LOG_FATAL #endif #define LOG(severity) LOG_ ## severity.stream() +#define VLOG(x) if((x)>0){}else LOG_INFO.stream() + class LogMessage { public: - LogMessage(const char* file, int line) : flushed_(false) { + LogMessage(const char* file, int line) + : flushed_(false) { stream() << file << ":" << line << ": "; } void Flush() { stream() << "\n"; string s = str_.str(); - int n = (int)s.size(); // shut up msvc - if(write(2, s.data(), n) < 0) {} // shut up gcc + size_t n = s.size(); + if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc flushed_ = true; } ~LogMessage() { @@ -63,24 +72,38 @@ class LogMessage { Flush(); } } - ostream& stream() { return str_; } - + std::ostream& stream() { return str_; } + private: bool flushed_; std::ostringstream str_; - DISALLOW_EVIL_CONSTRUCTORS(LogMessage); + + LogMessage(const LogMessage&) = delete; + LogMessage& operator=(const LogMessage&) = delete; }; +// Silence "destructor never returns" warning for ~LogMessageFatal(). +// Since this is a header file, push and then pop to limit the scope. 
+#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4722) +#endif + class LogMessageFatal : public LogMessage { public: LogMessageFatal(const char* file, int line) - : LogMessage(file, line) { } + : LogMessage(file, line) {} ~LogMessageFatal() { Flush(); abort(); } private: - DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal); + LogMessageFatal(const LogMessageFatal&) = delete; + LogMessageFatal& operator=(const LogMessageFatal&) = delete; }; -#endif // RE2_UTIL_LOGGING_H__ +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +#endif // UTIL_LOGGING_H_ diff --git a/contrib/libre2/util/mix.h b/contrib/libre2/util/mix.h new file mode 100644 index 00000000000..d85c172ab0e --- /dev/null +++ b/contrib/libre2/util/mix.h @@ -0,0 +1,41 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_MIX_H_ +#define UTIL_MIX_H_ + +#include +#include + +namespace re2 { + +// Silence "truncation of constant value" warning for kMul in 32-bit mode. +// Since this is a header file, push and then pop to limit the scope. +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable: 4309) +#endif + +class HashMix { + public: + HashMix() : hash_(1) {} + explicit HashMix(size_t val) : hash_(val + 83) {} + void Mix(size_t val) { + static const size_t kMul = static_cast(0xdc3eb94af8ab4c93ULL); + hash_ *= kMul; + hash_ = ((hash_ << 19) | + (hash_ >> (std::numeric_limits::digits - 19))) + val; + } + size_t get() const { return hash_; } + private: + size_t hash_; +}; + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +} // namespace re2 + +#endif // UTIL_MIX_H_ diff --git a/contrib/libre2/util/mutex.h b/contrib/libre2/util/mutex.h index 4a8de4c1838..9c491580481 100644 --- a/contrib/libre2/util/mutex.h +++ b/contrib/libre2/util/mutex.h @@ -2,64 +2,41 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+#ifndef UTIL_MUTEX_H_ +#define UTIL_MUTEX_H_ + /* * A simple mutex wrapper, supporting locks and read-write locks. * You should assume the locks are *not* re-entrant. */ -#ifndef RE2_UTIL_MUTEX_H_ -#define RE2_UTIL_MUTEX_H_ +#if !defined(_WIN32) +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif +#include +#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0 +#define MUTEX_IS_PTHREAD_RWLOCK +#endif +#endif +#if defined(MUTEX_IS_PTHREAD_RWLOCK) +#include #include +typedef pthread_rwlock_t MutexType; +#else +#include +typedef std::mutex MutexType; +#endif namespace re2 { -#define HAVE_PTHREAD 1 -#define HAVE_RWLOCK 1 - -#if defined(NO_THREADS) - typedef int MutexType; // to keep a lock-count -#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) - // Needed for pthread_rwlock_*. If it causes problems, you could take it - // out, but then you'd have to unset HAVE_RWLOCK (at least on linux -- it - // *does* cause problems for FreeBSD, or MacOSX, but isn't needed - // for locking there.) -# ifdef __linux__ -# undef _XOPEN_SOURCE -# define _XOPEN_SOURCE 500 // may be needed to get the rwlock calls -# endif -# include - typedef pthread_rwlock_t MutexType; -#elif defined(HAVE_PTHREAD) -# include - typedef pthread_mutex_t MutexType; -#elif defined(WIN32) -# define WIN32_LEAN_AND_MEAN // We only need minimal includes -# ifdef GMUTEX_TRYLOCK - // We need Windows NT or later for TryEnterCriticalSection(). If you - // don't need that functionality, you can remove these _WIN32_WINNT - // lines, and change TryLock() to assert(0) or something. -# ifndef _WIN32_WINNT -# define _WIN32_WINNT 0x0400 -# endif -# endif -# include - typedef CRITICAL_SECTION MutexType; -#else -# error Need to implement mutex.h for your architecture, or #define NO_THREADS -#endif - class Mutex { public: - // Create a Mutex that is not held by anybody. 
inline Mutex(); - - // Destructor inline ~Mutex(); - inline void Lock(); // Block if needed until free then acquire exclusively inline void Unlock(); // Release a lock acquired via Lock() - inline bool TryLock(); // If free, Lock() and return true, else return false // Note that on systems that don't support read-write locks, these may // be implemented as synonyms to Lock() and Unlock(). So you can use // these for efficiency, but don't use them anyplace where being able @@ -68,80 +45,44 @@ class Mutex { inline void ReaderUnlock(); // Release a read share of this Mutex inline void WriterLock() { Lock(); } // Acquire an exclusive lock inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock() - inline void AssertHeld() { } private: MutexType mutex_; // Catch the error of writing Mutex when intending MutexLock. Mutex(Mutex *ignored); - // Disallow "evil" constructors - Mutex(const Mutex&); - void operator=(const Mutex&); + + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; }; -// Now the implementation of Mutex for various systems -#if defined(NO_THREADS) +#if defined(MUTEX_IS_PTHREAD_RWLOCK) -// When we don't have threads, we can be either reading or writing, -// but not both. We can have lots of readers at once (in no-threads -// mode, that's most likely to happen in recursive function calls), -// but only one writer. We represent this by having mutex_ be -1 when -// writing and a number > 0 when reading (and 0 when no lock is held). -// -// In debug mode, we assert these invariants, while in non-debug mode -// we do nothing, for efficiency. That's why everything is in an -// assert. 
-#include - -Mutex::Mutex() : mutex_(0) { } -Mutex::~Mutex() { assert(mutex_ == 0); } -void Mutex::Lock() { assert(--mutex_ == -1); } -void Mutex::Unlock() { assert(mutex_++ == -1); } -bool Mutex::TryLock() { if (mutex_) return false; Lock(); return true; } -void Mutex::ReaderLock() { assert(++mutex_ > 0); } -void Mutex::ReaderUnlock() { assert(mutex_-- > 0); } - -#elif defined(HAVE_PTHREAD) && defined(HAVE_RWLOCK) - -#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) +#define SAFE_PTHREAD(fncall) \ + do { \ + if ((fncall) != 0) abort(); \ + } while (0) Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); } Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); } void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); } void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } -bool Mutex::TryLock() { return pthread_rwlock_trywrlock(&mutex_) == 0; } void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); } void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); } #undef SAFE_PTHREAD -#elif defined(HAVE_PTHREAD) +#else -#define SAFE_PTHREAD(fncall) do { if ((fncall) != 0) abort(); } while (0) - -Mutex::Mutex() { SAFE_PTHREAD(pthread_mutex_init(&mutex_, NULL)); } -Mutex::~Mutex() { SAFE_PTHREAD(pthread_mutex_destroy(&mutex_)); } -void Mutex::Lock() { SAFE_PTHREAD(pthread_mutex_lock(&mutex_)); } -void Mutex::Unlock() { SAFE_PTHREAD(pthread_mutex_unlock(&mutex_)); } -bool Mutex::TryLock() { return pthread_mutex_trylock(&mutex_) == 0; } -void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks -void Mutex::ReaderUnlock() { Unlock(); } -#undef SAFE_PTHREAD - -#elif defined(WIN32) - -Mutex::Mutex() { InitializeCriticalSection(&mutex_); } -Mutex::~Mutex() { DeleteCriticalSection(&mutex_); } -void Mutex::Lock() { EnterCriticalSection(&mutex_); } -void Mutex::Unlock() { LeaveCriticalSection(&mutex_); } -bool Mutex::TryLock() { return 
TryEnterCriticalSection(&mutex_) != 0; } -void Mutex::ReaderLock() { Lock(); } // we don't have read-write locks +Mutex::Mutex() { } +Mutex::~Mutex() { } +void Mutex::Lock() { mutex_.lock(); } +void Mutex::Unlock() { mutex_.unlock(); } +void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex. void Mutex::ReaderUnlock() { Unlock(); } #endif - // -------------------------------------------------------------------------- // Some helper classes @@ -152,9 +93,9 @@ class MutexLock { ~MutexLock() { mu_->Unlock(); } private: Mutex * const mu_; - // Disallow "evil" constructors - MutexLock(const MutexLock&); - void operator=(const MutexLock&); + + MutexLock(const MutexLock&) = delete; + MutexLock& operator=(const MutexLock&) = delete; }; // ReaderMutexLock and WriterMutexLock do the same, for rwlocks @@ -164,9 +105,9 @@ class ReaderMutexLock { ~ReaderMutexLock() { mu_->ReaderUnlock(); } private: Mutex * const mu_; - // Disallow "evil" constructors - ReaderMutexLock(const ReaderMutexLock&); - void operator=(const ReaderMutexLock&); + + ReaderMutexLock(const ReaderMutexLock&) = delete; + ReaderMutexLock& operator=(const ReaderMutexLock&) = delete; }; class WriterMutexLock { @@ -175,37 +116,16 @@ class WriterMutexLock { ~WriterMutexLock() { mu_->WriterUnlock(); } private: Mutex * const mu_; - // Disallow "evil" constructors - WriterMutexLock(const WriterMutexLock&); - void operator=(const WriterMutexLock&); + + WriterMutexLock(const WriterMutexLock&) = delete; + WriterMutexLock& operator=(const WriterMutexLock&) = delete; }; // Catch bug where variable name is omitted, e.g. MutexLock (&mu); -#define MutexLock(x) COMPILE_ASSERT(0, mutex_lock_decl_missing_var_name) -#define ReaderMutexLock(x) COMPILE_ASSERT(0, rmutex_lock_decl_missing_var_name) -#define WriterMutexLock(x) COMPILE_ASSERT(0, wmutex_lock_decl_missing_var_name) - -// Provide safe way to declare and use global, linker-initialized mutex. Sigh. 
-#ifdef HAVE_PTHREAD - -#define GLOBAL_MUTEX(name) \ - static pthread_mutex_t (name) = PTHREAD_MUTEX_INITIALIZER -#define GLOBAL_MUTEX_LOCK(name) \ - pthread_mutex_lock(&(name)) -#define GLOBAL_MUTEX_UNLOCK(name) \ - pthread_mutex_unlock(&(name)) - -#else - -#define GLOBAL_MUTEX(name) \ - static Mutex name -#define GLOBAL_MUTEX_LOCK(name) \ - name.Lock() -#define GLOBAL_MUTEX_UNLOCK(name) \ - name.Unlock() - -#endif +#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name") +#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name") +#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name") } // namespace re2 -#endif /* #define RE2_UTIL_MUTEX_H_ */ +#endif // UTIL_MUTEX_H_ diff --git a/contrib/libre2/util/pcre.cc b/contrib/libre2/util/pcre.cc index 6402cf2fe99..7e1d1ac9c12 100644 --- a/contrib/libre2/util/pcre.cc +++ b/contrib/libre2/util/pcre.cc @@ -6,12 +6,25 @@ // The main changes are the addition of the HitLimit method and // compilation as PCRE in namespace re2. +#include +#include #include +#include +#include +#include +#include +#include + #include "util/util.h" #include "util/flags.h" +#include "util/logging.h" #include "util/pcre.h" +#include "util/strutil.h" -#if __GNUC__ > 5 +// Silence warnings about the wacky formatting in the operator() functions. +// Note that we test for Clang first because it defines __GNUC__ as well. +#if defined(__clang__) +#elif defined(__GNUC__) && __GNUC__ >= 6 #pragma GCC diagnostic ignored "-Wmisleading-indentation" #endif @@ -26,6 +39,42 @@ DEFINE_int32(regexp_stack_limit, 256<<10, "default PCRE stack limit (bytes)"); DEFINE_int32(regexp_match_limit, 1000000, "default PCRE match limit (function calls)"); +#ifndef USEPCRE + +// Fake just enough of the PCRE API to allow this file to build. 
:) + +struct pcre_extra { + int flags; + int match_limit; + int match_limit_recursion; +}; + +#define PCRE_EXTRA_MATCH_LIMIT 0 +#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 +#define PCRE_ANCHORED 0 +#define PCRE_NOTEMPTY 0 +#define PCRE_ERROR_NOMATCH 1 +#define PCRE_ERROR_MATCHLIMIT 2 +#define PCRE_ERROR_RECURSIONLIMIT 3 +#define PCRE_INFO_CAPTURECOUNT 0 + +void pcre_free(void*) { +} + +pcre* pcre_compile(const char*, int, const char**, int*, const unsigned char*) { + return NULL; +} + +int pcre_exec(const pcre*, const pcre_extra*, const char*, int, int, int, int*, int) { + return 0; +} + +int pcre_fullinfo(const pcre*, const pcre_extra*, int, void*) { + return 0; +} + +#endif + namespace re2 { // Maximum number of args we can set @@ -117,7 +166,7 @@ pcre* PCRE::Compile(Anchor anchor) { // ANCHOR_BOTH Tack a "\z" to the end of the original pattern // and use a pcre anchored match. - const char* error; + const char* error = ""; int eoffset; pcre* re; if (anchor != ANCHOR_BOTH) { @@ -181,8 +230,8 @@ bool PCRE::FullMatchFunctor::operator ()(const StringPiece& text, if (&a15 == &no_more_args) goto done; args[n++] = &a15; done: - int consumed; - int vec[kVecSize]; + size_t consumed; + int vec[kVecSize] = {}; return re.DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize); } @@ -224,8 +273,8 @@ bool PCRE::PartialMatchFunctor::operator ()(const StringPiece& text, if (&a15 == &no_more_args) goto done; args[n++] = &a15; done: - int consumed; - int vec[kVecSize]; + size_t consumed; + int vec[kVecSize] = {}; return re.DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize); } @@ -267,8 +316,8 @@ bool PCRE::ConsumeFunctor::operator ()(StringPiece* input, if (&a15 == &no_more_args) goto done; args[n++] = &a15; done: - int consumed; - int vec[kVecSize]; + size_t consumed; + int vec[kVecSize] = {}; if (pattern.DoMatchImpl(*input, ANCHOR_START, &consumed, args, n, vec, kVecSize)) { input->remove_prefix(consumed); @@ -316,8 +365,8 @@ bool 
PCRE::FindAndConsumeFunctor::operator ()(StringPiece* input, if (&a15 == &no_more_args) goto done; args[n++] = &a15; done: - int consumed; - int vec[kVecSize]; + size_t consumed; + int vec[kVecSize] = {}; if (pattern.DoMatchImpl(*input, UNANCHORED, &consumed, args, n, vec, kVecSize)) { input->remove_prefix(consumed); @@ -330,7 +379,7 @@ done: bool PCRE::Replace(string *str, const PCRE& pattern, const StringPiece& rewrite) { - int vec[kVecSize]; + int vec[kVecSize] = {}; int matches = pattern.TryMatch(*str, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) return false; @@ -349,12 +398,12 @@ int PCRE::GlobalReplace(string *str, const PCRE& pattern, const StringPiece& rewrite) { int count = 0; - int vec[kVecSize]; + int vec[kVecSize] = {}; string out; size_t start = 0; bool last_match_was_empty_string = false; - for (; start <= str->length();) { + while (start <= str->size()) { // If the previous match was for the empty string, we shouldn't // just match again: we'll match in the same way and get an // infinite loop. 
Instead, we do the match in a special way: @@ -370,19 +419,20 @@ int PCRE::GlobalReplace(string *str, matches = pattern.TryMatch(*str, start, ANCHOR_START, false, vec, kVecSize); if (matches <= 0) { - if (start < str->length()) + if (start < str->size()) out.push_back((*str)[start]); start++; last_match_was_empty_string = false; continue; } } else { - matches = pattern.TryMatch(*str, start, UNANCHORED, true, vec, kVecSize); + matches = pattern.TryMatch(*str, start, UNANCHORED, true, + vec, kVecSize); if (matches <= 0) break; } - int matchstart = vec[0], matchend = vec[1]; - assert(matchstart >= static_cast(start)); + size_t matchstart = vec[0], matchend = vec[1]; + assert(matchstart >= start); assert(matchend >= matchstart); out.append(*str, start, matchstart - start); @@ -395,8 +445,9 @@ int PCRE::GlobalReplace(string *str, if (count == 0) return 0; - if (start < str->length()) - out.append(*str, start, str->length() - start); + if (start < str->size()) + out.append(*str, start, str->size() - start); + using std::swap; swap(out, *str); return count; } @@ -405,7 +456,7 @@ bool PCRE::Extract(const StringPiece &text, const PCRE& pattern, const StringPiece &rewrite, string *out) { - int vec[kVecSize]; + int vec[kVecSize] = {}; int matches = pattern.TryMatch(text, 0, UNANCHORED, true, vec, kVecSize); if (matches == 0) return false; @@ -424,7 +475,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) { // that. (This also makes it identical to the perl function of the // same name except for the null-character special case; // see `perldoc -f quotemeta`.) 
- for (int ii = 0; ii < unquoted.length(); ++ii) { + for (size_t ii = 0; ii < unquoted.size(); ++ii) { // Note that using 'isalnum' here raises the benchmark time from // 32ns to 58ns: if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && @@ -451,7 +502,7 @@ string PCRE::QuoteMeta(const StringPiece& unquoted) { /***** Actual matching and rewriting code *****/ bool PCRE::HitLimit() { - return hit_limit_; + return hit_limit_ != 0; } void PCRE::ClearHitLimit() { @@ -459,11 +510,11 @@ void PCRE::ClearHitLimit() { } int PCRE::TryMatch(const StringPiece& text, - int startpos, - Anchor anchor, - bool empty_ok, - int *vec, - int vecsize) const { + size_t startpos, + Anchor anchor, + bool empty_ok, + int *vec, + int vecsize) const { pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_; if (re == NULL) { PCREPORT(ERROR) << "Matching against invalid re: " << *error_; @@ -499,8 +550,8 @@ int PCRE::TryMatch(const StringPiece& text, int rc = pcre_exec(re, // The regular expression object &extra, (text.data() == NULL) ? 
"" : text.data(), - text.size(), - startpos, + static_cast(text.size()), + static_cast(startpos), options, vec, vecsize); @@ -554,18 +605,13 @@ int PCRE::TryMatch(const StringPiece& text, return rc; } -#if !__clang__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - bool PCRE::DoMatchImpl(const StringPiece& text, - Anchor anchor, - int* consumed, - const Arg* const* args, - int n, - int* vec, - int vecsize) const { + Anchor anchor, + size_t* consumed, + const Arg* const* args, + int n, + int* vec, + int vecsize) const { assert((1 + n) * 3 <= vecsize); // results + PCRE workspace int matches = TryMatch(text, 0, anchor, true, vec, vecsize); assert(matches >= 0); // TryMatch never returns negatives @@ -589,7 +635,17 @@ bool PCRE::DoMatchImpl(const StringPiece& text, for (int i = 0; i < n; i++) { const int start = vec[2*(i+1)]; const int limit = vec[2*(i+1)+1]; - if (!args[i]->Parse(text.data() + start, limit-start)) { + + // Avoid invoking undefined behavior when text.data() happens + // to be null and start happens to be -1, the latter being the + // case for an unmatched subexpression. Even if text.data() is + // not null, pointing one byte before was a longstanding bug. + const char* addr = NULL; + if (start != -1) { + addr = text.data() + start; + } + + if (!args[i]->Parse(addr, limit-start)) { // TODO: Should we indicate what the error was? 
return false; } @@ -598,19 +654,15 @@ bool PCRE::DoMatchImpl(const StringPiece& text, return true; } -#if !__clang__ -#pragma GCC diagnostic pop -#endif - bool PCRE::DoMatch(const StringPiece& text, - Anchor anchor, - int* consumed, - const Arg* const args[], - int n) const { + Anchor anchor, + size_t* consumed, + const Arg* const args[], + int n) const { assert(n >= 0); - size_t const vecsize = (1 + n) * 3; // results + PCRE workspace - // (as for kVecSize) - int *vec = new int[vecsize]; + const int vecsize = (1 + n) * 3; // results + PCRE workspace + // (as for kVecSize) + int* vec = new int[vecsize]; bool b = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize); delete[] vec; return b; @@ -695,41 +747,52 @@ int PCRE::NumberOfCapturingGroups() const { if (re_partial_ == NULL) return -1; int result; - CHECK(pcre_fullinfo(re_partial_, // The regular expression object - NULL, // We did not study the pattern - PCRE_INFO_CAPTURECOUNT, - &result) == 0); + int rc = pcre_fullinfo(re_partial_, // The regular expression object + NULL, // We did not study the pattern + PCRE_INFO_CAPTURECOUNT, + &result); + if (rc != 0) { + PCREPORT(ERROR) << "Unexpected return code: " << rc; + return -1; + } return result; } /***** Parsers for various types *****/ -bool PCRE::Arg::parse_null(const char* str, int n, void* dest) { +bool PCRE::Arg::parse_null(const char* str, size_t n, void* dest) { // We fail if somebody asked us to store into a non-NULL void* pointer return (dest == NULL); } -bool PCRE::Arg::parse_string(const char* str, int n, void* dest) { +bool PCRE::Arg::parse_string(const char* str, size_t n, void* dest) { if (dest == NULL) return true; reinterpret_cast(dest)->assign(str, n); return true; } -bool PCRE::Arg::parse_stringpiece(const char* str, int n, void* dest) { +bool PCRE::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { if (dest == NULL) return true; - reinterpret_cast(dest)->set(str, n); + *(reinterpret_cast(dest)) = StringPiece(str, n); return 
true; } -bool PCRE::Arg::parse_char(const char* str, int n, void* dest) { +bool PCRE::Arg::parse_char(const char* str, size_t n, void* dest) { if (n != 1) return false; if (dest == NULL) return true; *(reinterpret_cast(dest)) = str[0]; return true; } -bool PCRE::Arg::parse_uchar(const char* str, int n, void* dest) { +bool PCRE::Arg::parse_schar(const char* str, size_t n, void* dest) { + if (n != 1) return false; + if (dest == NULL) return true; + *(reinterpret_cast(dest)) = str[0]; + return true; +} + +bool PCRE::Arg::parse_uchar(const char* str, size_t n, void* dest) { if (n != 1) return false; if (dest == NULL) return true; *(reinterpret_cast(dest)) = str[0]; @@ -746,7 +809,7 @@ static const int kMaxNumberLength = 32; // a. "str" if no termination is needed // b. "buf" if the string was copied and null-terminated // c. "" if the input was invalid and has no hope of being parsed -static const char* TerminateNumber(char* buf, const char* str, int n) { +static const char* TerminateNumber(char* buf, const char* str, size_t n) { if ((n > 0) && isspace(*str)) { // We are less forgiving than the strtoxxx() routines and do not // allow leading spaces. @@ -769,9 +832,9 @@ static const char* TerminateNumber(char* buf, const char* str, int n) { } bool PCRE::Arg::parse_long_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, str, n); @@ -786,16 +849,16 @@ bool PCRE::Arg::parse_long_radix(const char* str, } bool PCRE::Arg::parse_ulong_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, str, n); if (str[0] == '-') { - // strtoul() will silently accept negative numbers and parse - // them. This module is more strict and treats them as errors. 
- return false; + // strtoul() will silently accept negative numbers and parse + // them. This module is more strict and treats them as errors. + return false; } char* end; @@ -809,74 +872,74 @@ bool PCRE::Arg::parse_ulong_radix(const char* str, } bool PCRE::Arg::parse_short_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((short)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (short)r; return true; } bool PCRE::Arg::parse_ushort_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse - if ((ushort)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned short)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (unsigned short)r; return true; } bool PCRE::Arg::parse_int_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { long r; - if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse - if ((int)r != r) return false; // Out of range + if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse + if ((int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (int)r; return true; } bool PCRE::Arg::parse_uint_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { unsigned long r; - if (!parse_ulong_radix(str, n, &r, 
radix)) return false; // Could not parse - if ((uint)r != r) return false; // Out of range + if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse + if ((unsigned int)r != r) return false; // Out of range if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = (unsigned int)r; return true; } bool PCRE::Arg::parse_longlong_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, str, n); char* end; errno = 0; - int64 r = strtoll(str, &end, radix); + long long r = strtoll(str, &end, radix); if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = r; return true; } bool PCRE::Arg::parse_ulonglong_radix(const char* str, - int n, - void* dest, - int radix) { + size_t n, + void* dest, + int radix) { if (n == 0) return false; char buf[kMaxNumberLength+1]; str = TerminateNumber(buf, str, n); @@ -887,26 +950,32 @@ bool PCRE::Arg::parse_ulonglong_radix(const char* str, } char* end; errno = 0; - uint64 r = strtoull(str, &end, radix); + unsigned long long r = strtoull(str, &end, radix); if (end != str + n) return false; // Leftover junk if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + *(reinterpret_cast(dest)) = r; return true; } -bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { +static bool parse_double_float(const char* str, size_t n, bool isfloat, + void* dest) { if (n == 0) return false; static const int kMaxLength = 200; char buf[kMaxLength]; if (n >= kMaxLength) return false; memcpy(buf, str, n); buf[n] = '\0'; - errno = 0; char* end; - double r = strtod(buf, &end); + errno = 0; + double r; + if (isfloat) { + r = strtof(buf, &end); + } else { + r = strtod(buf, &end); + } if (end != buf + n) { -#ifdef COMPILER_MSVC 
+#ifdef _WIN32 // Microsoft's strtod() doesn't handle inf and nan, so we have to // handle it explicitly. Speed is not important here because this // code is only called in unit tests. @@ -918,12 +987,12 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { } else if ('+' == *i) { ++i; } - if (0 == stricmp(i, "inf") || 0 == stricmp(i, "infinity")) { - r = numeric_limits::infinity(); + if (0 == _stricmp(i, "inf") || 0 == _stricmp(i, "infinity")) { + r = std::numeric_limits::infinity(); if (!pos) r = -r; - } else if (0 == stricmp(i, "nan")) { - r = numeric_limits::quiet_NaN(); + } else if (0 == _stricmp(i, "nan")) { + r = std::numeric_limits::quiet_NaN(); } else { return false; } @@ -933,42 +1002,47 @@ bool PCRE::Arg::parse_double(const char* str, int n, void* dest) { } if (errno) return false; if (dest == NULL) return true; - *(reinterpret_cast(dest)) = r; + if (isfloat) { + *(reinterpret_cast(dest)) = (float)r; + } else { + *(reinterpret_cast(dest)) = r; + } return true; } -bool PCRE::Arg::parse_float(const char* str, int n, void* dest) { - double r; - if (!parse_double(str, n, &r)) return false; - if (dest == NULL) return true; - *(reinterpret_cast(dest)) = static_cast(r); - return true; +bool PCRE::Arg::parse_double(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, false, dest); } +bool PCRE::Arg::parse_float(const char* str, size_t n, void* dest) { + return parse_double_float(str, n, true, dest); +} -#define DEFINE_INTEGER_PARSERS(name) \ - bool PCRE::Arg::parse_##name(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 10); \ - } \ - bool PCRE::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 16); \ - } \ - bool PCRE::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \ - return parse_##name##_radix(str, n, dest, 8); \ - } \ - bool PCRE::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \ - return 
parse_##name##_radix(str, n, dest, 0); \ +#define DEFINE_INTEGER_PARSER(name) \ + bool PCRE::Arg::parse_##name(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 10); \ + } \ + bool PCRE::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ + return parse_##name##_radix(str, n, dest, 16); \ + } \ + bool PCRE::Arg::parse_##name##_octal(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 8); \ + } \ + bool PCRE::Arg::parse_##name##_cradix(const char* str, size_t n, \ + void* dest) { \ + return parse_##name##_radix(str, n, dest, 0); \ } -DEFINE_INTEGER_PARSERS(short); -DEFINE_INTEGER_PARSERS(ushort); -DEFINE_INTEGER_PARSERS(int); -DEFINE_INTEGER_PARSERS(uint); -DEFINE_INTEGER_PARSERS(long); -DEFINE_INTEGER_PARSERS(ulong); -DEFINE_INTEGER_PARSERS(longlong); -DEFINE_INTEGER_PARSERS(ulonglong); +DEFINE_INTEGER_PARSER(short); +DEFINE_INTEGER_PARSER(ushort); +DEFINE_INTEGER_PARSER(int); +DEFINE_INTEGER_PARSER(uint); +DEFINE_INTEGER_PARSER(long); +DEFINE_INTEGER_PARSER(ulong); +DEFINE_INTEGER_PARSER(longlong); +DEFINE_INTEGER_PARSER(ulonglong); -#undef DEFINE_INTEGER_PARSERS +#undef DEFINE_INTEGER_PARSER } // namespace re2 diff --git a/contrib/libre2/util/pcre.h b/contrib/libre2/util/pcre.h index 4dda95dfa15..7c6403d1925 100644 --- a/contrib/libre2/util/pcre.h +++ b/contrib/libre2/util/pcre.h @@ -2,6 +2,9 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_PCRE_H_ +#define UTIL_PCRE_H_ + // This is a variant of PCRE's pcrecpp.h, originally written at Google. // The main changes are the addition of the HitLimit method and // compilation as PCRE in namespace re2. 
@@ -167,22 +170,9 @@ namespace re2 { const bool UsingPCRE = true; } // namespace re2 #else +struct pcre; // opaque namespace re2 { const bool UsingPCRE = false; -struct pcre; -struct pcre_extra { int flags, match_limit, match_limit_recursion; }; -#define pcre_free(x) {} -#define PCRE_EXTRA_MATCH_LIMIT 0 -#define PCRE_EXTRA_MATCH_LIMIT_RECURSION 0 -#define PCRE_ANCHORED 0 -#define PCRE_NOTEMPTY 0 -#define PCRE_ERROR_NOMATCH 1 -#define PCRE_ERROR_MATCHLIMIT 2 -#define PCRE_ERROR_RECURSIONLIMIT 3 -#define PCRE_INFO_CAPTURECOUNT 0 -#define pcre_compile(a,b,c,d,e) ({ (void)(a); (void)(b); *(c)=""; *(d)=0; (void)(e); ((pcre*)0); }) -#define pcre_exec(a, b, c, d, e, f, g, h) ({ (void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f); (void)(g); (void)(h); 0; }) -#define pcre_fullinfo(a, b, c, d) ({ (void)(a); (void)(b); (void)(c); *(d) = 0; 0; }) } // namespace re2 #endif @@ -258,7 +248,7 @@ class PCRE { // type, or one of: // string (matched piece is copied to string) // StringPiece (StringPiece is mutated to point to matched piece) - // T (where "bool T::ParseFrom(const char*, int)" exists) + // T (where "bool T::ParseFrom(const char*, size_t)" exists) // (void*)NULL (the corresponding matched sub-pattern is not copied) // // Returns true iff all of the following conditions are satisfied: @@ -452,7 +442,7 @@ class PCRE { // "*consumed" if successful. bool DoMatch(const StringPiece& text, Anchor anchor, - int* consumed, + size_t* consumed, const Arg* const* args, int n) const; // Return the number of capturing subpatterns, or -1 if the @@ -475,7 +465,7 @@ class PCRE { // When matching PCRE("(foo)|hello") against "hello", it will return 1. // But the values for all subpattern are filled in into "vec". 
int TryMatch(const StringPiece& text, - int startpos, + size_t startpos, Anchor anchor, bool empty_ok, int *vec, @@ -492,7 +482,7 @@ class PCRE { // internal implementation for DoMatch bool DoMatchImpl(const StringPiece& text, Anchor anchor, - int* consumed, + size_t* consumed, const Arg* const args[], int n, int* vec, @@ -509,8 +499,10 @@ class PCRE { bool report_errors_; // Silences error logging if false int match_limit_; // Limit on execution resources int stack_limit_; // Limit on stack resources (bytes) - mutable int32_t hit_limit_; // Hit limit during execution (bool)? - DISALLOW_EVIL_CONSTRUCTORS(PCRE); + mutable int32_t hit_limit_; // Hit limit during execution (bool)? + + PCRE(const PCRE&) = delete; + PCRE& operator=(const PCRE&) = delete; }; // PCRE_Options allow you to set the PCRE::Options, plus any pcre @@ -565,7 +557,7 @@ class PCRE_Options { template class _PCRE_MatchObject { public: - static inline bool Parse(const char* str, int n, void* dest) { + static inline bool Parse(const char* str, size_t n, void* dest) { if (dest == NULL) return true; T* object = reinterpret_cast(dest); return object->ParseFrom(str, n); @@ -580,16 +572,21 @@ class PCRE::Arg { // Constructor specially designed for NULL arguments Arg(void*); - typedef bool (*Parser)(const char* str, int n, void* dest); + typedef bool (*Parser)(const char* str, size_t n, void* dest); // Type-specific parsers -#define MAKE_PARSER(type,name) \ - Arg(type* p) : arg_(p), parser_(name) { } \ - Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \ - +#define MAKE_PARSER(type, name) \ + Arg(type* p) : arg_(p), parser_(name) {} \ + Arg(type* p, Parser parser) : arg_(p), parser_(parser) {} MAKE_PARSER(char, parse_char); + MAKE_PARSER(signed char, parse_schar); MAKE_PARSER(unsigned char, parse_uchar); + MAKE_PARSER(float, parse_float); + MAKE_PARSER(double, parse_double); + MAKE_PARSER(string, parse_string); + MAKE_PARSER(StringPiece, parse_stringpiece); + MAKE_PARSER(short, parse_short); 
MAKE_PARSER(unsigned short, parse_ushort); MAKE_PARSER(int, parse_int); @@ -598,10 +595,6 @@ class PCRE::Arg { MAKE_PARSER(unsigned long, parse_ulong); MAKE_PARSER(long long, parse_longlong); MAKE_PARSER(unsigned long long, parse_ulonglong); - MAKE_PARSER(float, parse_float); - MAKE_PARSER(double, parse_double); - MAKE_PARSER(string, parse_string); - MAKE_PARSER(StringPiece, parse_stringpiece); #undef MAKE_PARSER @@ -613,29 +606,31 @@ class PCRE::Arg { } // Parse the data - bool Parse(const char* str, int n) const; + bool Parse(const char* str, size_t n) const; private: void* arg_; Parser parser_; - static bool parse_null (const char* str, int n, void* dest); - static bool parse_char (const char* str, int n, void* dest); - static bool parse_uchar (const char* str, int n, void* dest); - static bool parse_float (const char* str, int n, void* dest); - static bool parse_double (const char* str, int n, void* dest); - static bool parse_string (const char* str, int n, void* dest); - static bool parse_stringpiece (const char* str, int n, void* dest); + static bool parse_null (const char* str, size_t n, void* dest); + static bool parse_char (const char* str, size_t n, void* dest); + static bool parse_schar (const char* str, size_t n, void* dest); + static bool parse_uchar (const char* str, size_t n, void* dest); + static bool parse_float (const char* str, size_t n, void* dest); + static bool parse_double (const char* str, size_t n, void* dest); + static bool parse_string (const char* str, size_t n, void* dest); + static bool parse_stringpiece (const char* str, size_t n, void* dest); -#define DECLARE_INTEGER_PARSER(name) \ - private: \ - static bool parse_ ## name(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _radix( \ - const char* str, int n, void* dest, int radix); \ - public: \ - static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \ - static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \ - static bool 
parse_ ## name ## _cradix(const char* str, int n, void* dest) +#define DECLARE_INTEGER_PARSER(name) \ + private: \ + static bool parse_##name(const char* str, size_t n, void* dest); \ + static bool parse_##name##_radix(const char* str, size_t n, void* dest, \ + int radix); \ + \ + public: \ + static bool parse_##name##_hex(const char* str, size_t n, void* dest); \ + static bool parse_##name##_octal(const char* str, size_t n, void* dest); \ + static bool parse_##name##_cradix(const char* str, size_t n, void* dest) DECLARE_INTEGER_PARSER(short); DECLARE_INTEGER_PARSER(ushort); @@ -647,23 +642,27 @@ class PCRE::Arg { DECLARE_INTEGER_PARSER(ulonglong); #undef DECLARE_INTEGER_PARSER + }; inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { } inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { } -inline bool PCRE::Arg::Parse(const char* str, int n) const { +inline bool PCRE::Arg::Parse(const char* str, size_t n) const { return (*parser_)(str, n, arg_); } // This part of the parser, appropriate only for ints, deals with bases -#define MAKE_INTEGER_PARSER(type, name) \ - inline PCRE::Arg Hex(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _hex); } \ - inline PCRE::Arg Octal(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _octal); } \ - inline PCRE::Arg CRadix(type* ptr) { \ - return PCRE::Arg(ptr, PCRE::Arg::parse_ ## name ## _cradix); } +#define MAKE_INTEGER_PARSER(type, name) \ + inline PCRE::Arg Hex(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \ + } \ + inline PCRE::Arg Octal(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \ + } \ + inline PCRE::Arg CRadix(type* ptr) { \ + return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \ + } MAKE_INTEGER_PARSER(short, short); MAKE_INTEGER_PARSER(unsigned short, ushort); @@ -677,3 +676,5 @@ MAKE_INTEGER_PARSER(unsigned long long, ulonglong); #undef MAKE_INTEGER_PARSER } // namespace re2 + +#endif // UTIL_PCRE_H_ diff 
--git a/contrib/libre2/util/random.cc b/contrib/libre2/util/random.cc deleted file mode 100644 index 49d6195876a..00000000000 --- a/contrib/libre2/util/random.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2005-2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Modified from Google perftools's tcmalloc_unittest.cc. - -#include "util/random.h" - -namespace re2 { - -int32 ACMRandom::Next() { - const int32 M = 2147483647L; // 2^31-1 - const int32 A = 16807; - // In effect, we are computing seed_ = (seed_ * A) % M, where M = 2^31-1 - uint32 lo = A * (int32)(seed_ & 0xFFFF); - uint32 hi = A * (int32)((uint32)seed_ >> 16); - lo += (hi & 0x7FFF) << 16; - if (lo > M) { - lo &= M; - ++lo; - } - lo += hi >> 15; - if (lo > M) { - lo &= M; - ++lo; - } - return (seed_ = (int32) lo); -} - -int32 ACMRandom::Uniform(int32 n) { - return Next() % n; -} - -} // namespace re2 diff --git a/contrib/libre2/util/random.h b/contrib/libre2/util/random.h deleted file mode 100644 index 6c6e701ddf6..00000000000 --- a/contrib/libre2/util/random.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2005-2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Modified from Google perftools's tcmalloc_unittest.cc. - -#ifndef RE2_UTIL_RANDOM_H__ -#define RE2_UTIL_RANDOM_H__ - -#include "util/util.h" - -namespace re2 { - -// ACM minimal standard random number generator. (re-entrant.) 
-class ACMRandom { - public: - ACMRandom(int32 seed) : seed_(seed) {} - int32 Next(); - int32 Uniform(int32); - - void Reset(int32 seed) { seed_ = seed; } - - private: - int32 seed_; -}; - -} // namespace re2 - -#endif // RE2_UTIL_RANDOM_H__ diff --git a/contrib/libre2/util/rune.cc b/contrib/libre2/util/rune.cc index 26442b0ad3d..4f625ea380f 100644 --- a/contrib/libre2/util/rune.cc +++ b/contrib/libre2/util/rune.cc @@ -11,8 +11,10 @@ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. */ + #include #include + #include "util/utf.h" namespace re2 { @@ -133,7 +135,7 @@ runetochar(char *str, const Rune *rune) */ c = *rune; if(c <= Rune1) { - str[0] = c; + str[0] = static_cast(c); return 1; } @@ -142,7 +144,7 @@ runetochar(char *str, const Rune *rune) * 0080-07FF => T2 Tx */ if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); + str[0] = T2 | static_cast(c >> 1*Bitx); str[1] = Tx | (c & Maskx); return 2; } @@ -161,9 +163,9 @@ runetochar(char *str, const Rune *rune) * 0800-FFFF => T3 Tx Tx */ if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); + str[0] = T3 | static_cast(c >> 2*Bitx); str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); + str[2] = Tx | (c & Maskx); return 3; } @@ -171,7 +173,7 @@ runetochar(char *str, const Rune *rune) * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T4 | (c >> 3*Bitx); + str[0] = T4 | static_cast(c >> 3*Bitx); str[1] = Tx | ((c >> 2*Bitx) & Maskx); str[2] = Tx | ((c >> 1*Bitx) & Maskx); str[3] = Tx | (c & Maskx); diff --git a/contrib/libre2/util/sparse_array.h b/contrib/libre2/util/sparse_array.h index f572905e5b8..86d4d50d169 100644 --- a/contrib/libre2/util/sparse_array.h +++ b/contrib/libre2/util/sparse_array.h @@ -2,97 +2,111 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
+#ifndef UTIL_SPARSE_ARRAY_H_ +#define UTIL_SPARSE_ARRAY_H_ + // DESCRIPTION -// +// // SparseArray(m) is a map from integers in [0, m) to T values. // It requires (sizeof(T)+sizeof(int))*m memory, but it provides // fast iteration through the elements in the array and fast clearing // of the array. The array has a concept of certain elements being // uninitialized (having no value). -// +// // Insertion and deletion are constant time operations. -// -// Allocating the array is a constant time operation +// +// Allocating the array is a constant time operation // when memory allocation is a constant time operation. -// +// // Clearing the array is a constant time operation (unusual!). -// +// // Iterating through the array is an O(n) operation, where n // is the number of items in the array (not O(m)). // -// The array iterator visits entries in the order they were first +// The array iterator visits entries in the order they were first // inserted into the array. It is safe to add items to the array while // using an iterator: the iterator will visit indices added to the array // during the iteration, but will not re-visit indices whose values // change after visiting. Thus SparseArray can be a convenient // implementation of a work queue. -// +// // The SparseArray implementation is NOT thread-safe. It is up to the // caller to make sure only one thread is accessing the array. (Typically // these arrays are temporary values and used in situations where speed is // important.) -// +// // The SparseArray interface does not present all the usual STL bells and // whistles. -// +// // Implemented with reference to Briggs & Torczon, An Efficient // Representation for Sparse Sets, ACM Letters on Programming Languages // and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. -// +// // Briggs & Torczon popularized this technique, but it had been known // long before their paper. 
They point out that Aho, Hopcroft, and // Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's // 1986 Programming Pearls both hint at the technique in exercises to the // reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1 // exercise 8). -// +// // Briggs & Torczon describe a sparse set implementation. I have // trivially generalized it to create a sparse array (actually the original // target of the AHU and Bentley exercises). // IMPLEMENTATION // -// SparseArray uses a vector dense_ and an array sparse_to_dense_, both of -// size max_size_. At any point, the number of elements in the sparse array is -// size_. -// -// The vector dense_ contains the size_ elements in the sparse array (with +// SparseArray is an array dense_ and an array sparse_, both of size max_size_. +// At any point, the number of elements in the sparse array is size_. +// +// The array dense_ contains the size_ elements in the sparse array (with // their indices), // in the order that the elements were first inserted. This array is dense: // the size_ pairs are dense_[0] through dense_[size_-1]. // -// The array sparse_to_dense_ maps from indices in [0,m) to indices in -// [0,size_). -// For indices present in the array, dense_[sparse_to_dense_[i]].index_ == i. -// For indices not present in the array, sparse_to_dense_ can contain -// any value at all, perhaps outside the range [0, size_) but perhaps not. -// -// The lax requirement on sparse_to_dense_ values makes clearing -// the array very easy: set size_ to 0. Lookups are slightly more -// complicated. An index i has a value in the array if and only if: -// sparse_to_dense_[i] is in [0, size_) AND -// dense_[sparse_to_dense_[i]].index_ == i. -// If both these properties hold, only then it is safe to refer to -// dense_[sparse_to_dense_[i]].value_ +// The array sparse_ maps from indices in [0,m) to indices in [0,size_). +// For indices present in the array, dense_[sparse_[i]].index_ == i. 
+// For indices not present in the array, sparse_ can contain any value at all, +// perhaps outside the range [0, size_) but perhaps not. +// +// The lax requirement on sparse_ values makes clearing the array very easy: +// set size_ to 0. Lookups are slightly more complicated. +// An index i has a value in the array if and only if: +// sparse_[i] is in [0, size_) AND +// dense_[sparse_[i]].index_ == i. +// If both these properties hold, only then it is safe to refer to +// dense_[sparse_[i]].value_ // as the value associated with index i. // -// To insert a new entry, set sparse_to_dense_[i] to size_, +// To insert a new entry, set sparse_[i] to size_, // initialize dense_[size_], and then increment size_. // // Deletion of specific values from the array is implemented by // swapping dense_[size_-1] and the dense_ being deleted and then -// updating the appropriate sparse_to_dense_ entries. -// +// updating the appropriate sparse_ entries. +// // To make the sparse array as efficient as possible for non-primitive types, // elements may or may not be destroyed when they are deleted from the sparse // array through a call to erase(), erase_existing() or resize(). They // immediately become inaccessible, but they are only guaranteed to be // destroyed when the SparseArray destructor is called. +// +// A moved-from SparseArray will be empty. -#ifndef RE2_UTIL_SPARSE_ARRAY_H__ -#define RE2_UTIL_SPARSE_ARRAY_H__ +// Doing this simplifies the logic below. +#ifndef __has_feature +#define __has_feature(x) 0 +#endif -#include "util/util.h" +#include +#include +#include +#if __has_feature(memory_sanitizer) +#include +#endif +#include +#include +#include +#include namespace re2 { @@ -100,36 +114,49 @@ template class SparseArray { public: SparseArray(); - SparseArray(int max_size); + explicit SparseArray(int max_size); ~SparseArray(); // IndexValue pairs: exposed in SparseArray::iterator. 
class IndexValue; + static_assert(std::is_trivially_destructible::value, + "IndexValue must be trivially destructible"); typedef IndexValue value_type; - typedef typename vector::iterator iterator; - typedef typename vector::const_iterator const_iterator; + typedef IndexValue* iterator; + typedef const IndexValue* const_iterator; - inline const IndexValue& iv(int i) const; + SparseArray(const SparseArray& src); + SparseArray(SparseArray&& src) /*noexcept*/; + + SparseArray& operator=(const SparseArray& src); + SparseArray& operator=(SparseArray&& src) /*noexcept*/; + + const IndexValue& iv(int i) const; // Return the number of entries in the array. int size() const { return size_; } + // Indicate whether the array is empty. + int empty() const { + return size_ == 0; + } + // Iterate over the array. iterator begin() { - return dense_.begin(); + return dense_.get(); } iterator end() { - return dense_.begin() + size_; + return dense_.get() + size_; } const_iterator begin() const { - return dense_.begin(); + return dense_.get(); } const_iterator end() const { - return dense_.begin() + size_; + return dense_.get() + size_; } // Change the maximum size of the array. @@ -148,39 +175,68 @@ class SparseArray { } // Check whether index i is in the array. - inline bool has_index(int i) const; + bool has_index(int i) const; // Comparison function for sorting. // Can sort the sparse array so that future iterations // will visit indices in increasing order using - // sort(arr.begin(), arr.end(), arr.less); + // std::sort(arr.begin(), arr.end(), arr.less); static bool less(const IndexValue& a, const IndexValue& b); public: // Set the value at index i to v. 
- inline iterator set(int i, Value v); + iterator set(int i, const Value& v) { + return SetInternal(true, i, v); + } + iterator set(int i, Value&& v) { // NOLINT + return SetInternal(true, i, std::move(v)); + } - pair insert(const value_type& new_value); + std::pair insert(const value_type& v) { + return InsertInternal(v); + } + std::pair insert(value_type&& v) { // NOLINT + return InsertInternal(std::move(v)); + } - // Returns the value at index i - // or defaultv if index i is not initialized in the array. - inline Value get(int i, Value defaultv) const; + template + std::pair emplace(Args&&... args) { // NOLINT + return InsertInternal(value_type(std::forward(args)...)); + } - iterator find(int i); + iterator find(int i) { + if (has_index(i)) + return dense_.get() + sparse_[i]; + return end(); + } - const_iterator find(int i) const; + const_iterator find(int i) const { + if (has_index(i)) + return dense_.get() + sparse_[i]; + return end(); + } // Change the value at index i to v. // Fast but unsafe: only use if has_index(i) is true. - inline iterator set_existing(int i, Value v); + iterator set_existing(int i, const Value& v) { + return SetExistingInternal(i, v); + } + iterator set_existing(int i, Value&& v) { // NOLINT + return SetExistingInternal(i, std::move(v)); + } // Set the value at the new index i to v. // Fast but unsafe: only use if has_index(i) is false. - inline iterator set_new(int i, Value v); + iterator set_new(int i, const Value& v) { + return SetInternal(false, i, v); + } + iterator set_new(int i, Value&& v) { // NOLINT + return SetInternal(false, i, std::move(v)); + } // Get the value at index i from the array.. // Fast but unsafe: only use if has_index(i) is true. - inline Value get_existing(int i) const; + const Value& get_existing(int i) const; // Erasing items from the array during iteration is in general // NOT safe. 
There is one special case, which is that the current @@ -201,37 +257,132 @@ class SparseArray { // the iterators could walk past the end of the array. // Erases the element at index i from the array. - inline void erase(int i); + void erase(int i); // Erases the element at index i from the array. // Fast but unsafe: only use if has_index(i) is true. - inline void erase_existing(int i); + void erase_existing(int i); private: + template + std::pair InsertInternal(U&& v) { + DebugCheckInvariants(); + std::pair p; + if (has_index(v.index_)) { + p = {dense_.get() + sparse_[v.index_], false}; + } else { + p = {set_new(std::forward(v).index_, std::forward(v).second), true}; + } + DebugCheckInvariants(); + return p; + } + + template + iterator SetInternal(bool allow_overwrite, int i, U&& v) { // NOLINT + DebugCheckInvariants(); + if (static_cast(i) >= static_cast(max_size_)) { + assert(false && "illegal index"); + // Semantically, end() would be better here, but we already know + // the user did something stupid, so begin() insulates them from + // dereferencing an invalid pointer. + return begin(); + } + if (!allow_overwrite) { + assert(!has_index(i)); + create_index(i); + } else { + if (!has_index(i)) + create_index(i); + } + return set_existing(i, std::forward(v)); // NOLINT + } + + template + iterator SetExistingInternal(int i, U&& v) { // NOLINT + DebugCheckInvariants(); + assert(has_index(i)); + dense_[sparse_[i]].value() = std::forward(v); + DebugCheckInvariants(); + return dense_.get() + sparse_[i]; + } + // Add the index i to the array. // Only use if has_index(i) is known to be false. // Since it doesn't set the value associated with i, // this function is private, only intended as a helper // for other methods. - inline void create_index(int i); + void create_index(int i); // In debug mode, verify that some invariant properties of the class // are being maintained. 
This is called at the end of the constructor // and at the beginning and end of all public non-const member functions. - inline void DebugCheckInvariants() const; + void DebugCheckInvariants() const; - int size_; - int max_size_; - int* sparse_to_dense_; - vector dense_; - bool valgrind_; + // Initializes memory for elements [min, max). + void MaybeInitializeMemory(int min, int max) { +#if __has_feature(memory_sanitizer) + __msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]); +#elif defined(RE2_ON_VALGRIND) + for (int i = min; i < max; i++) { + sparse_[i] = 0xababababU; + } +#endif + } - DISALLOW_EVIL_CONSTRUCTORS(SparseArray); + int size_ = 0; + int max_size_ = 0; + std::unique_ptr sparse_; + std::unique_ptr dense_; }; template -SparseArray::SparseArray() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(), valgrind_(RunningOnValgrind()) {} +SparseArray::SparseArray() = default; + +template +SparseArray::SparseArray(const SparseArray& src) + : size_(src.size_), + max_size_(src.max_size_), + sparse_(new int[max_size_]), + dense_(new IndexValue[max_size_]) { + std::copy_n(src.sparse_.get(), max_size_, sparse_.get()); + std::copy_n(src.dense_.get(), max_size_, dense_.get()); +} + +template +SparseArray::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT + : size_(src.size_), + max_size_(src.max_size_), + sparse_(std::move(src.sparse_)), + dense_(std::move(src.dense_)) { + src.size_ = 0; + src.max_size_ = 0; +} + +template +SparseArray& SparseArray::operator=(const SparseArray& src) { + size_ = src.size_; + max_size_ = src.max_size_; + std::unique_ptr a(new int[max_size_]); + std::copy_n(src.sparse_.get(), src.max_size_, a.get()); + sparse_ = std::move(a); + std::unique_ptr b(new IndexValue[max_size_]); + std::copy_n(src.dense_.get(), src.max_size_, b.get()); + dense_ = std::move(b); + return *this; +} + +template +SparseArray& SparseArray::operator=( + SparseArray&& src) /*noexcept*/ { // NOLINT + size_ = src.size_; + max_size_ = 
src.max_size_; + sparse_ = std::move(src.sparse_); + dense_ = std::move(src.dense_); + // clear out the source + src.size_ = 0; + src.max_size_ = 0; + return *this; +} // IndexValue pairs: exposed in SparseArray::iterator. template @@ -242,48 +393,55 @@ class SparseArray::IndexValue { typedef Value second_type; IndexValue() {} - IndexValue(int index, const Value& value) : second(value), index_(index) {} + IndexValue(int i, const Value& v) : index_(i), second(v) {} + IndexValue(int i, Value&& v) : index_(i), second(std::move(v)) {} int index() const { return index_; } - Value value() const { return second; } - // Provide the data in the 'second' member so that the utilities - // in map-util work. - Value second; + Value& value() /*&*/ { return second; } + const Value& value() const /*&*/ { return second; } + //Value&& value() /*&&*/ { return std::move(second); } // NOLINT private: int index_; + + public: + // Provide the data in the 'second' member so that the utilities + // in map-util work. + // TODO(billydonahue): 'second' is public for short-term compatibility. + // Users will be transitioned to using value() accessor. + Value second; }; template const typename SparseArray::IndexValue& SparseArray::iv(int i) const { - DCHECK_GE(i, 0); - DCHECK_LT(i, size_); + assert(i >= 0); + assert(i < size_); return dense_[i]; } // Change the maximum size of the array. // Invalidates all iterators. template -void SparseArray::resize(int new_max_size) { +void SparseArray::resize(int max_size) { DebugCheckInvariants(); - if (new_max_size > max_size_) { - int* a = new int[new_max_size]; - if (sparse_to_dense_) { - memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); - // Don't need to zero the memory but appease Valgrind. 
- if (valgrind_) { - for (int i = max_size_; i < new_max_size; i++) - a[i] = 0xababababU; - } - delete[] sparse_to_dense_; + if (max_size > max_size_) { + std::unique_ptr a(new int[max_size]); + if (sparse_) { + std::copy_n(sparse_.get(), max_size_, a.get()); } - sparse_to_dense_ = a; + sparse_ = std::move(a); - dense_.resize(new_max_size); + std::unique_ptr b(new IndexValue[max_size]); + if (dense_) { + std::copy_n(dense_.get(), max_size_, b.get()); + } + dense_ = std::move(b); + + MaybeInitializeMemory(max_size_, max_size); } - max_size_ = new_max_size; + max_size_ = max_size; if (size_ > max_size_) size_ = max_size_; DebugCheckInvariants(); @@ -292,97 +450,20 @@ void SparseArray::resize(int new_max_size) { // Check whether index i is in the array. template bool SparseArray::has_index(int i) const { - DCHECK_GE(i, 0); - DCHECK_LT(i, max_size_); - if (static_cast(i) >= static_cast(max_size_)) { + assert(i >= 0); + assert(i < max_size_); + if (static_cast(i) >= static_cast(max_size_)) { return false; } - // Unsigned comparison avoids checking sparse_to_dense_[i] < 0. - return (uint)sparse_to_dense_[i] < (uint)size_ && - dense_[sparse_to_dense_[i]].index_ == i; -} - -// Set the value at index i to v. -template -typename SparseArray::iterator SparseArray::set(int i, Value v) { - DebugCheckInvariants(); - if (static_cast(i) >= static_cast(max_size_)) { - // Semantically, end() would be better here, but we already know - // the user did something stupid, so begin() insulates them from - // dereferencing an invalid pointer. - return begin(); - } - if (!has_index(i)) - create_index(i); - return set_existing(i, v); + // Unsigned comparison avoids checking sparse_[i] < 0. 
+ return (uint32_t)sparse_[i] < (uint32_t)size_ && + dense_[sparse_[i]].index_ == i; } template -pair::iterator, bool> SparseArray::insert( - const value_type& new_value) { - DebugCheckInvariants(); - pair::iterator, bool> p; - if (has_index(new_value.index_)) { - p = make_pair(dense_.begin() + sparse_to_dense_[new_value.index_], false); - } else { - p = make_pair(set_new(new_value.index_, new_value.second), true); - } - DebugCheckInvariants(); - return p; -} - -template -Value SparseArray::get(int i, Value defaultv) const { - if (!has_index(i)) - return defaultv; - return get_existing(i); -} - -template -typename SparseArray::iterator SparseArray::find(int i) { - if (has_index(i)) - return dense_.begin() + sparse_to_dense_[i]; - return end(); -} - -template -typename SparseArray::const_iterator -SparseArray::find(int i) const { - if (has_index(i)) { - return dense_.begin() + sparse_to_dense_[i]; - } - return end(); -} - -template -typename SparseArray::iterator -SparseArray::set_existing(int i, Value v) { - DebugCheckInvariants(); - DCHECK(has_index(i)); - dense_[sparse_to_dense_[i]].second = v; - DebugCheckInvariants(); - return dense_.begin() + sparse_to_dense_[i]; -} - -template -typename SparseArray::iterator -SparseArray::set_new(int i, Value v) { - DebugCheckInvariants(); - if (static_cast(i) >= static_cast(max_size_)) { - // Semantically, end() would be better here, but we already know - // the user did something stupid, so begin() insulates them from - // dereferencing an invalid pointer. 
- return begin(); - } - DCHECK(!has_index(i)); - create_index(i); - return set_existing(i, v); -} - -template -Value SparseArray::get_existing(int i) const { - DCHECK(has_index(i)); - return dense_[sparse_to_dense_[i]].second; +const Value& SparseArray::get_existing(int i) const { + assert(has_index(i)); + return dense_[sparse_[i]].second; } template @@ -396,11 +477,11 @@ void SparseArray::erase(int i) { template void SparseArray::erase_existing(int i) { DebugCheckInvariants(); - DCHECK(has_index(i)); - int di = sparse_to_dense_[i]; + assert(has_index(i)); + int di = sparse_[i]; if (di < size_ - 1) { - dense_[di] = dense_[size_ - 1]; - sparse_to_dense_[dense_[di].index_] = di; + dense_[di] = std::move(dense_[size_ - 1]); + sparse_[dense_[di].index_] = di; } size_--; DebugCheckInvariants(); @@ -408,38 +489,30 @@ void SparseArray::erase_existing(int i) { template void SparseArray::create_index(int i) { - DCHECK(!has_index(i)); - DCHECK_LT(size_, max_size_); - sparse_to_dense_[i] = size_; + assert(!has_index(i)); + assert(size_ < max_size_); + sparse_[i] = size_; dense_[size_].index_ = i; size_++; } template SparseArray::SparseArray(int max_size) { - max_size_ = max_size; - sparse_to_dense_ = new int[max_size]; - valgrind_ = RunningOnValgrind(); - dense_.resize(max_size); - // Don't need to zero the new memory, but appease Valgrind. 
- if (valgrind_) { - for (int i = 0; i < max_size; i++) { - sparse_to_dense_[i] = 0xababababU; - dense_[i].index_ = 0xababababU; - } - } + sparse_.reset(new int[max_size]); + dense_.reset(new IndexValue[max_size]); size_ = 0; + MaybeInitializeMemory(size_, max_size); + max_size_ = max_size; DebugCheckInvariants(); } template SparseArray::~SparseArray() { DebugCheckInvariants(); - delete[] sparse_to_dense_; } template void SparseArray::DebugCheckInvariants() const { - DCHECK_LE(0, size_); - DCHECK_LE(size_, max_size_); - DCHECK(size_ == 0 || sparse_to_dense_ != NULL); + assert(0 <= size_); + assert(size_ <= max_size_); + assert(size_ == 0 || sparse_ != NULL); } // Comparison function for sorting. @@ -450,4 +523,4 @@ template bool SparseArray::less(const IndexValue& a, } // namespace re2 -#endif // RE2_UTIL_SPARSE_ARRAY_H__ +#endif // UTIL_SPARSE_ARRAY_H_ diff --git a/contrib/libre2/util/sparse_set.h b/contrib/libre2/util/sparse_set.h index 89d0d73aec2..2efdd282501 100644 --- a/contrib/libre2/util/sparse_set.h +++ b/contrib/libre2/util/sparse_set.h @@ -2,178 +2,265 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +#ifndef UTIL_SPARSE_SET_H_ +#define UTIL_SPARSE_SET_H_ + // DESCRIPTION -// -// SparseSet(m) is a set of integers in [0, m). +// +// SparseSet(m) is a set of integers in [0, m). // It requires sizeof(int)*m memory, but it provides // fast iteration through the elements in the set and fast clearing // of the set. -// +// // Insertion and deletion are constant time operations. -// -// Allocating the set is a constant time operation +// +// Allocating the set is a constant time operation // when memory allocation is a constant time operation. -// +// // Clearing the set is a constant time operation (unusual!). -// +// // Iterating through the set is an O(n) operation, where n // is the number of items in the set (not O(m)). 
// -// The set iterator visits entries in the order they were first -// inserted into the array. It is safe to add items to the set while +// The set iterator visits entries in the order they were first +// inserted into the set. It is safe to add items to the set while // using an iterator: the iterator will visit indices added to the set // during the iteration, but will not re-visit indices whose values // change after visiting. Thus SparseSet can be a convenient // implementation of a work queue. -// +// // The SparseSet implementation is NOT thread-safe. It is up to the // caller to make sure only one thread is accessing the set. (Typically // these sets are temporary values and used in situations where speed is // important.) -// +// // The SparseSet interface does not present all the usual STL bells and // whistles. -// +// // Implemented with reference to Briggs & Torczon, An Efficient // Representation for Sparse Sets, ACM Letters on Programming Languages // and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69. -// -// For a generalization to sparse array, see sparse_array.h. +// +// This is a specialization of sparse array; see sparse_array.h. // IMPLEMENTATION // -// See sparse_array.h for implementation details +// See sparse_array.h for implementation details. -#ifndef RE2_UTIL_SPARSE_SET_H__ -#define RE2_UTIL_SPARSE_SET_H__ +// Doing this simplifies the logic below. 
+#ifndef __has_feature +#define __has_feature(x) 0 +#endif -#include "util/util.h" +#include +#include +#include +#if __has_feature(memory_sanitizer) +#include +#endif +#include +#include +#include namespace re2 { -class SparseSet { +template +class SparseSetT { public: - SparseSet() - : size_(0), max_size_(0), sparse_to_dense_(NULL), dense_(NULL), valgrind_(RunningOnValgrind()) {} - - SparseSet(int max_size) { - max_size_ = max_size; - sparse_to_dense_ = new int[max_size]; - dense_ = new int[max_size]; - valgrind_ = RunningOnValgrind(); - // Don't need to zero the memory, but do so anyway - // to appease Valgrind. - if (valgrind_) { - for (int i = 0; i < max_size; i++) { - dense_[i] = 0xababababU; - sparse_to_dense_[i] = 0xababababU; - } - } - size_ = 0; - } - - ~SparseSet() { - delete[] sparse_to_dense_; - delete[] dense_; - } + SparseSetT(); + explicit SparseSetT(int max_size); + ~SparseSetT(); typedef int* iterator; typedef const int* const_iterator; - int size() const { return size_; } - iterator begin() { return dense_; } - iterator end() { return dense_ + size_; } - const_iterator begin() const { return dense_; } - const_iterator end() const { return dense_ + size_; } + // Return the number of entries in the set. + int size() const { + return size_; + } - // Change the maximum size of the array. + // Indicate whether the set is empty. + int empty() const { + return size_ == 0; + } + + // Iterate over the set. + iterator begin() { + return dense_.get(); + } + iterator end() { + return dense_.get() + size_; + } + + const_iterator begin() const { + return dense_.get(); + } + const_iterator end() const { + return dense_.get() + size_; + } + + // Change the maximum size of the set. // Invalidates all iterators. 
- void resize(int new_max_size) { - if (size_ > new_max_size) - size_ = new_max_size; - if (new_max_size > max_size_) { - int* a = new int[new_max_size]; - if (sparse_to_dense_) { - memmove(a, sparse_to_dense_, max_size_*sizeof a[0]); - if (valgrind_) { - for (int i = max_size_; i < new_max_size; i++) - a[i] = 0xababababU; - } - delete[] sparse_to_dense_; - } - sparse_to_dense_ = a; + void resize(int max_size); - a = new int[new_max_size]; - if (dense_) { - memmove(a, dense_, size_*sizeof a[0]); - if (valgrind_) { - for (int i = size_; i < new_max_size; i++) - a[i] = 0xababababU; - } - delete[] dense_; - } - dense_ = a; - } - max_size_ = new_max_size; - } - - // Return the maximum size of the array. + // Return the maximum size of the set. // Indices can be in the range [0, max_size). - int max_size() const { return max_size_; } - - // Clear the array. - void clear() { size_ = 0; } - - // Check whether i is in the array. - bool contains(int i) const { - DCHECK_GE(i, 0); - DCHECK_LT(i, max_size_); - if (static_cast(i) >= static_cast(max_size_)) { - return false; - } - // Unsigned comparison avoids checking sparse_to_dense_[i] < 0. - return (uint)sparse_to_dense_[i] < (uint)size_ && - dense_[sparse_to_dense_[i]] == i; + int max_size() const { + return max_size_; } - // Adds i to the set. - void insert(int i) { - if (!contains(i)) - insert_new(i); + // Clear the set. + void clear() { + size_ = 0; } - // Set the value at the new index i to v. + // Check whether index i is in the set. + bool contains(int i) const; + + // Comparison function for sorting. + // Can sort the sparse set so that future iterations + // will visit indices in increasing order using + // std::sort(arr.begin(), arr.end(), arr.less); + static bool less(int a, int b); + + public: + // Insert index i into the set. + iterator insert(int i) { + return InsertInternal(true, i); + } + + // Insert index i into the set. // Fast but unsafe: only use if contains(i) is false. 
- void insert_new(int i) { - if (static_cast(i) >= static_cast(max_size_)) { + iterator insert_new(int i) { + return InsertInternal(false, i); + } + + private: + iterator InsertInternal(bool allow_existing, int i) { + DebugCheckInvariants(); + if (static_cast(i) >= static_cast(max_size_)) { + assert(false && "illegal index"); // Semantically, end() would be better here, but we already know // the user did something stupid, so begin() insulates them from // dereferencing an invalid pointer. - return; + return begin(); } - DCHECK(!contains(i)); - DCHECK_LT(size_, max_size_); - sparse_to_dense_[i] = size_; - dense_[size_] = i; - size_++; + if (!allow_existing) { + assert(!contains(i)); + create_index(i); + } else { + if (!contains(i)) + create_index(i); + } + DebugCheckInvariants(); + return dense_.get() + sparse_[i]; } - // Comparison function for sorting. - // Can sort the sparse array so that future iterations - // will visit indices in increasing order using - // sort(arr.begin(), arr.end(), arr.less); - static bool less(int a, int b) { return a < b; } + // Add the index i to the set. + // Only use if contains(i) is known to be false. + // This function is private, only intended as a helper + // for other methods. + void create_index(int i); - private: - int size_; - int max_size_; - int* sparse_to_dense_; - int* dense_; - bool valgrind_; + // In debug mode, verify that some invariant properties of the class + // are being maintained. This is called at the end of the constructor + // and at the beginning and end of all public non-const member functions. + void DebugCheckInvariants() const; - DISALLOW_EVIL_CONSTRUCTORS(SparseSet); + // Initializes memory for elements [min, max). 
+ void MaybeInitializeMemory(int min, int max) { +#if __has_feature(memory_sanitizer) + __msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]); +#elif defined(RE2_ON_VALGRIND) + for (int i = min; i < max; i++) { + sparse_[i] = 0xababababU; + } +#endif + } + + int size_ = 0; + int max_size_ = 0; + std::unique_ptr sparse_; + std::unique_ptr dense_; }; +template +SparseSetT::SparseSetT() = default; + +// Change the maximum size of the set. +// Invalidates all iterators. +template +void SparseSetT::resize(int max_size) { + DebugCheckInvariants(); + if (max_size > max_size_) { + std::unique_ptr a(new int[max_size]); + if (sparse_) { + std::copy_n(sparse_.get(), max_size_, a.get()); + } + sparse_ = std::move(a); + + std::unique_ptr b(new int[max_size]); + if (dense_) { + std::copy_n(dense_.get(), max_size_, b.get()); + } + dense_ = std::move(b); + + MaybeInitializeMemory(max_size_, max_size); + } + max_size_ = max_size; + if (size_ > max_size_) + size_ = max_size_; + DebugCheckInvariants(); +} + +// Check whether index i is in the set. +template +bool SparseSetT::contains(int i) const { + assert(i >= 0); + assert(i < max_size_); + if (static_cast(i) >= static_cast(max_size_)) { + return false; + } + // Unsigned comparison avoids checking sparse_[i] < 0. 
+ return (uint32_t)sparse_[i] < (uint32_t)size_ && + dense_[sparse_[i]] == i; +} + +template +void SparseSetT::create_index(int i) { + assert(!contains(i)); + assert(size_ < max_size_); + sparse_[i] = size_; + dense_[size_] = i; + size_++; +} + +template SparseSetT::SparseSetT(int max_size) { + sparse_.reset(new int[max_size]); + dense_.reset(new int[max_size]); + size_ = 0; + MaybeInitializeMemory(size_, max_size); + max_size_ = max_size; + DebugCheckInvariants(); +} + +template SparseSetT::~SparseSetT() { + DebugCheckInvariants(); +} + +template void SparseSetT::DebugCheckInvariants() const { + assert(0 <= size_); + assert(size_ <= max_size_); + assert(size_ == 0 || sparse_ != NULL); +} + +// Comparison function for sorting. +template bool SparseSetT::less(int a, int b) { + return a < b; +} + +typedef SparseSetT SparseSet; + } // namespace re2 -#endif // RE2_UTIL_SPARSE_SET_H__ +#endif // UTIL_SPARSE_SET_H_ diff --git a/contrib/libre2/util/stringpiece.cc b/contrib/libre2/util/stringpiece.cc deleted file mode 100644 index cce3235061f..00000000000 --- a/contrib/libre2/util/stringpiece.cc +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2004 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#include "re2/stringpiece.h" -#include "util/util.h" - -using re2::StringPiece; - -std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { - o.write(piece.data(), piece.size()); - return o; -} - -bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) { - int len = x.size(); - if (len != y.size()) { - return false; - } - const char* p = x.data(); - const char* p2 = y.data(); - // Test last byte in case strings share large common prefix - if ((len > 0) && (p[len-1] != p2[len-1])) return false; - const char* p_limit = p + len; - for (; p < p_limit; p++, p2++) { - if (*p != *p2) - return false; - } - return true; -} - -void StringPiece::CopyToString(string* target) const { - target->assign(ptr_, length_); -} - -int StringPiece::copy(char* buf, size_type n, size_type pos) const { - int ret = min(length_ - pos, n); - memcpy(buf, ptr_ + pos, ret); - return ret; -} - -int StringPiece::find(const StringPiece& s, size_type pos) const { - if (length_ < 0 || pos > static_cast(length_)) - return npos; - - const char* result = std::search(ptr_ + pos, ptr_ + length_, - s.ptr_, s.ptr_ + s.length_); - const size_type xpos = result - ptr_; - return xpos + s.length_ <= static_cast(length_) ? xpos : npos; -} - -int StringPiece::find(char c, size_type pos) const { - if (length_ <= 0 || pos >= static_cast(length_)) { - return npos; - } - const char* result = std::find(ptr_ + pos, ptr_ + length_, c); - return result != ptr_ + length_ ? result - ptr_ : npos; -} - -int StringPiece::rfind(const StringPiece& s, size_type pos) const { - if (length_ < s.length_) return npos; - const size_t ulen = length_; - if (s.length_ == 0) return min(ulen, pos); - - const char* last = ptr_ + min(ulen - s.length_, pos) + s.length_; - const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); - return result != last ? 
result - ptr_ : npos; -} - -int StringPiece::rfind(char c, size_type pos) const { - if (length_ <= 0) return npos; - for (int i = min(pos, static_cast(length_ - 1)); - i >= 0; --i) { - if (ptr_[i] == c) { - return i; - } - } - return npos; -} - -StringPiece StringPiece::substr(size_type pos, size_type n) const { - if (pos > static_cast(length_)) pos = length_; - if (n > length_ - pos) n = length_ - pos; - return StringPiece(ptr_ + pos, n); -} - -const StringPiece::size_type StringPiece::npos = size_type(-1); diff --git a/contrib/libre2/util/stringprintf.cc b/contrib/libre2/util/stringprintf.cc deleted file mode 100644 index 1618951db39..00000000000 --- a/contrib/libre2/util/stringprintf.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright 2002 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include "util/util.h" - -namespace re2 { - -static void StringAppendV(string* dst, const char* format, va_list ap) { - // First try with a small fixed size buffer - char space[1024]; - - // It's possible for methods that use a va_list to invalidate - // the data in it upon use. The fix is to make a copy - // of the structure before using it and use that copy instead. 
- va_list backup_ap; - va_copy(backup_ap, ap); - int result = vsnprintf(space, sizeof(space), format, backup_ap); - va_end(backup_ap); - - if ((result >= 0) && (static_cast(result) < sizeof(space))) { - // It fit - dst->append(space, result); - return; - } - - // Repeatedly increase buffer size until it fits - int length = sizeof(space); - while (true) { - if (result < 0) { - // Older behavior: just try doubling the buffer size - length *= 2; - } else { - // We need exactly "result+1" characters - length = result+1; - } - char* buf = new char[length]; - - // Restore the va_list before we use it again - va_copy(backup_ap, ap); - result = vsnprintf(buf, length, format, backup_ap); - va_end(backup_ap); - - if ((result >= 0) && (result < length)) { - // It fit - dst->append(buf, result); - delete[] buf; - return; - } - delete[] buf; - } -} - -string StringPrintf(const char* format, ...) { - va_list ap; - va_start(ap, format); - string result; - StringAppendV(&result, format, ap); - va_end(ap); - return result; -} - -void SStringPrintf(string* dst, const char* format, ...) { - va_list ap; - va_start(ap, format); - dst->clear(); - StringAppendV(dst, format, ap); - va_end(ap); -} - -void StringAppendF(string* dst, const char* format, ...) { - va_list ap; - va_start(ap, format); - StringAppendV(dst, format, ap); - va_end(ap); -} - -} // namespace re2 diff --git a/contrib/libre2/util/strutil.cc b/contrib/libre2/util/strutil.cc index 6ab79b3c6b6..8eabfa475cf 100644 --- a/contrib/libre2/util/strutil.cc +++ b/contrib/libre2/util/strutil.cc @@ -2,8 +2,15 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. 
-#include "util/util.h" -#include "re2/stringpiece.h" +#include +#include + +#include "util/strutil.h" + +#ifdef _WIN32 +#define snprintf _snprintf +#define vsnprintf _vsnprintf +#endif namespace re2 { @@ -12,16 +19,16 @@ namespace re2 { // Copies 'src' to 'dest', escaping dangerous characters using // C-style escape sequences. 'src' and 'dest' should not overlap. // Returns the number of bytes written to 'dest' (not including the \0) -// or -1 if there was insufficient space. +// or (size_t)-1 if there was insufficient space. // ---------------------------------------------------------------------- -int CEscapeString(const char* src, int src_len, char* dest, - int dest_len) { +static size_t CEscapeString(const char* src, size_t src_len, + char* dest, size_t dest_len) { const char* src_end = src + src_len; - int used = 0; + size_t used = 0; for (; src < src_end; src++) { - if (dest_len - used < 2) // Need space for two letter escape - return -1; + if (dest_len - used < 2) // space for two-character escape + return (size_t)-1; unsigned char c = *src; switch (c) { @@ -36,9 +43,9 @@ int CEscapeString(const char* src, int src_len, char* dest, // digit then that digit must be escaped too to prevent it being // interpreted as part of the character code by C. 
if (c < ' ' || c > '~') { - if (dest_len - used < 4) // need space for 4 letter escape - return -1; - sprintf(dest + used, "\\%03o", c); + if (dest_len - used < 5) // space for four-character escape + \0 + return (size_t)-1; + snprintf(dest + used, 5, "\\%03o", c); used += 4; } else { dest[used++] = c; break; @@ -47,51 +54,111 @@ int CEscapeString(const char* src, int src_len, char* dest, } if (dest_len - used < 1) // make sure that there is room for \0 - return -1; + return (size_t)-1; dest[used] = '\0'; // doesn't count towards return value though return used; } - // ---------------------------------------------------------------------- // CEscape() // Copies 'src' to result, escaping dangerous characters using -// C-style escape sequences. 'src' and 'dest' should not overlap. +// C-style escape sequences. 'src' and 'dest' should not overlap. // ---------------------------------------------------------------------- string CEscape(const StringPiece& src) { - const int dest_length = src.size() * 4 + 1; // Maximum possible expansion - char* dest = new char[dest_length]; - const int len = CEscapeString(src.data(), src.size(), - dest, dest_length); - string s = string(dest, len); + const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion + char* dest = new char[dest_len]; + const size_t used = CEscapeString(src.data(), src.size(), + dest, dest_len); + string s = string(dest, used); delete[] dest; return s; } -string PrefixSuccessor(const StringPiece& prefix) { +void PrefixSuccessor(string* prefix) { // We can increment the last character in the string and be done // unless that character is 255, in which case we have to erase the // last character and increment the previous character, unless that // is 255, etc. If the string is empty or consists entirely of // 255's, we just return the empty string. 
- bool done = false; - string limit(prefix.data(), prefix.size()); - int index = limit.length() - 1; - while (!done && index >= 0) { - if ((limit[index]&255) == 255) { - limit.erase(index); - index--; + while (!prefix->empty()) { + char& c = prefix->back(); + if (c == '\xff') { // char literal avoids signed/unsigned. + prefix->pop_back(); } else { - limit[index]++; - done = true; + ++c; + break; } } - if (!done) { - return ""; - } else { - return limit; - } +} + +static void StringAppendV(string* dst, const char* format, va_list ap) { + // First try with a small fixed size buffer + char space[1024]; + + // It's possible for methods that use a va_list to invalidate + // the data in it upon use. The fix is to make a copy + // of the structure before using it and use that copy instead. + va_list backup_ap; + va_copy(backup_ap, ap); + int result = vsnprintf(space, sizeof(space), format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (static_cast(result) < sizeof(space))) { + // It fit + dst->append(space, result); + return; + } + + // Repeatedly increase buffer size until it fits + int length = sizeof(space); + while (true) { + if (result < 0) { + // Older behavior: just try doubling the buffer size + length *= 2; + } else { + // We need exactly "result+1" characters + length = result+1; + } + char* buf = new char[length]; + + // Restore the va_list before we use it again + va_copy(backup_ap, ap); + result = vsnprintf(buf, length, format, backup_ap); + va_end(backup_ap); + + if ((result >= 0) && (result < length)) { + // It fit + dst->append(buf, result); + delete[] buf; + return; + } + delete[] buf; + } +} + +string StringPrintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +void SStringPrintf(string* dst, const char* format, ...) 
{ + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); +} + +void StringAppendF(string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); } } // namespace re2 diff --git a/contrib/libre2/util/strutil.h b/contrib/libre2/util/strutil.h new file mode 100644 index 00000000000..2c3c10467e5 --- /dev/null +++ b/contrib/libre2/util/strutil.h @@ -0,0 +1,23 @@ +// Copyright 2016 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#ifndef UTIL_STRUTIL_H_ +#define UTIL_STRUTIL_H_ + +#include + +#include "re2/stringpiece.h" +#include "util/util.h" + +namespace re2 { + +string CEscape(const StringPiece& src); +void PrefixSuccessor(string* prefix); +string StringPrintf(const char* format, ...); +void SStringPrintf(string* dst, const char* format, ...); +void StringAppendF(string* dst, const char* format, ...); + +} // namespace re2 + +#endif // UTIL_STRUTIL_H_ diff --git a/contrib/libre2/util/test.cc b/contrib/libre2/util/test.cc index 0644829d8ac..fb31ed812e5 100644 --- a/contrib/libre2/util/test.cc +++ b/contrib/libre2/util/test.cc @@ -3,7 +3,10 @@ // license that can be found in the LICENSE file. 
#include +#ifndef _WIN32 #include +#endif + #include "util/test.h" DEFINE_string(test_tmpdir, "/var/tmp", "temp directory"); @@ -21,15 +24,7 @@ void RegisterTest(void (*fn)(void), const char *name) { tests[ntests++].name = name; } -namespace re2 { -int64 VirtualProcessSize() { - struct rusage ru; - getrusage(RUSAGE_SELF, &ru); - return (int64)ru.ru_maxrss*1024; -} -} // namespace re2 - -int main(int argc, char **argv) { +int main(int argc, char** argv) { for (int i = 0; i < ntests; i++) { printf("%s\n", tests[i].name); tests[i].fn(); diff --git a/contrib/libre2/util/test.h b/contrib/libre2/util/test.h index 0f938655536..e075c1ecc8d 100644 --- a/contrib/libre2/util/test.h +++ b/contrib/libre2/util/test.h @@ -2,11 +2,12 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_TEST_H__ -#define RE2_UTIL_TEST_H__ +#ifndef UTIL_TEST_H_ +#define UTIL_TEST_H_ #include "util/util.h" #include "util/flags.h" +#include "util/logging.h" #define TEST(x, y) \ void x##y(void); \ @@ -31,27 +32,15 @@ class TestRegisterer { #define EXPECT_GE CHECK_GE #define EXPECT_FALSE(x) CHECK(!(x)) -#define ARRAYSIZE arraysize - -#define EXPECT_TRUE_M(x, y) CHECK(x) << (y) -#define EXPECT_FALSE_M(x, y) CHECK(!(x)) << (y) -#define ASSERT_TRUE_M(x, y) CHECK(x) << (y) -#define ASSERT_EQUALS(x, y) CHECK_EQ(x, y) - -const bool UsingMallocCounter = false; namespace testing { class MallocCounter { public: - MallocCounter(int x) { } + MallocCounter(int x) {} static const int THIS_THREAD_ONLY = 0; long long HeapGrowth() { return 0; } long long PeakHeapGrowth() { return 0; } - void Reset() { } + void Reset() {} }; } // namespace testing -namespace re2 { -int64 VirtualProcessSize(); -} // namespace re2 - -#endif // RE2_UTIL_TEST_H__ +#endif // UTIL_TEST_H_ diff --git a/contrib/libre2/util/thread.cc b/contrib/libre2/util/thread.cc deleted file mode 100644 index 7349991530a..00000000000 --- a/contrib/libre2/util/thread.cc +++ /dev/null @@ 
-1,44 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#include - -#include "util/util.h" -#include "util/thread.h" - -Thread::Thread() { - pid_ = 0; - running_ = 0; - joinable_ = 0; -} - -Thread::~Thread() { -} - -void *startThread(void *v) { - Thread* t = (Thread*)v; - t->Run(); - return 0; -} - -void Thread::Start() { - CHECK(!running_); - pthread_create(&pid_, 0, startThread, this); - running_ = true; - if (!joinable_) - pthread_detach(pid_); -} - -void Thread::Join() { - CHECK(running_); - CHECK(joinable_); - void *val; - pthread_join(pid_, &val); - running_ = 0; -} - -void Thread::SetJoinable(bool j) { - CHECK(!running_); - joinable_ = j; -} diff --git a/contrib/libre2/util/thread.h b/contrib/libre2/util/thread.h deleted file mode 100644 index b9610e04589..00000000000 --- a/contrib/libre2/util/thread.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -#ifndef RE2_UTIL_THREAD_H__ -#define RE2_UTIL_THREAD_H__ - -#include - -class Thread { - public: - Thread(); - virtual ~Thread(); - void Start(); - void Join(); - void SetJoinable(bool); - virtual void Run() = 0; - - private: - pthread_t pid_; - bool running_; - bool joinable_; -}; - -#endif // RE2_UTIL_THREAD_H__ - diff --git a/contrib/libre2/util/utf.h b/contrib/libre2/util/utf.h index 06ff8f03eb4..85b42972390 100644 --- a/contrib/libre2/util/utf.h +++ b/contrib/libre2/util/utf.h @@ -14,8 +14,9 @@ * This file and rune.cc have been converted to compile as C++ code * in name space re2. 
*/ -#ifndef RE2_UTIL_UTF_H__ -#define RE2_UTIL_UTF_H__ + +#ifndef UTIL_UTF_H_ +#define UTIL_UTF_H_ #include @@ -40,4 +41,4 @@ char* utfrune(const char*, Rune); } // namespace re2 -#endif // RE2_UTIL_UTF_H__ +#endif // UTIL_UTF_H_ diff --git a/contrib/libre2/util/util.h b/contrib/libre2/util/util.h index 6a7b2a8e1be..a69d8429722 100644 --- a/contrib/libre2/util/util.h +++ b/contrib/libre2/util/util.h @@ -2,125 +2,21 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -#ifndef RE2_UTIL_UTIL_H__ -#define RE2_UTIL_UTIL_H__ +#ifndef UTIL_UTIL_H_ +#define UTIL_UTIL_H_ -// C -#include -#include -#include -#include // For size_t -#include -#include -#include -#include -#include // For isdigit, isalpha. - -// C++ -#include +// TODO(junyer): Get rid of this. #include -#include -#include -#include -#include -#include -#include -#include - -// Use std names. -using std::set; -using std::pair; -using std::vector; using std::string; -using std::min; -using std::max; -using std::ostream; -using std::map; -using std::stack; -using std::sort; -using std::swap; -using std::make_pair; -#if defined(__GNUC__) && !defined(USE_CXX0X) && !defined(_LIBCPP_ABI_VERSION) && !defined(OS_ANDROID) +#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0])) -#include -using std::tr1::unordered_set; - -#else - -#include -#if defined(WIN32) || defined(OS_ANDROID) -using std::tr1::unordered_set; -#else -using std::unordered_set; +#ifndef FALLTHROUGH_INTENDED +#define FALLTHROUGH_INTENDED do { } while (0) #endif +#ifndef NO_THREAD_SAFETY_ANALYSIS +#define NO_THREAD_SAFETY_ANALYSIS #endif -namespace re2 { - -typedef int8_t int8; -typedef uint8_t uint8; -typedef int16_t int16; -typedef uint16_t uint16; -typedef int32_t int32; -typedef uint32_t uint32; -typedef int64_t int64; -typedef uint64_t uint64; - -typedef unsigned long ulong; -typedef unsigned int uint; -typedef unsigned short ushort; - -// COMPILE_ASSERT causes a compile error about 
msg if expr is not true. -#if __cplusplus >= 201103L -#define COMPILE_ASSERT(expr, msg) static_assert(expr, #msg) -#else -template struct CompileAssert {}; -#define COMPILE_ASSERT(expr, msg) \ - typedef CompileAssert<(bool(expr))> msg[bool(expr) ? 1 : -1] -#endif - -// DISALLOW_EVIL_CONSTRUCTORS disallows the copy and operator= functions. -// It goes in the private: declarations in a class. -#define DISALLOW_EVIL_CONSTRUCTORS(TypeName) \ - TypeName(const TypeName&); \ - void operator=(const TypeName&) - -#define arraysize(array) (sizeof(array)/sizeof((array)[0])) - -class StringPiece; - -string CEscape(const StringPiece& src); -int CEscapeString(const char* src, int src_len, char* dest, int dest_len); - -extern string StringPrintf(const char* format, ...); -extern void SStringPrintf(string* dst, const char* format, ...); -extern void StringAppendF(string* dst, const char* format, ...); -extern string PrefixSuccessor(const StringPiece& prefix); - -uint32 hashword(const uint32*, size_t, uint32); -void hashword2(const uint32*, size_t, uint32*, uint32*); - -static inline uint32 Hash32StringWithSeed(const char* s, int len, uint32 seed) { - return hashword((uint32*)s, len/4, seed); -} - -static inline uint64 Hash64StringWithSeed(const char* s, int len, uint32 seed) { - uint32 x, y; - x = seed; - y = 0; - hashword2((uint32*)s, len/4, &x, &y); - return ((uint64)x << 32) | y; -} - -int RunningOnValgrind(); - -} // namespace re2 - -#include "util/arena.h" -#include "util/logging.h" -#include "util/mutex.h" -#include "util/utf.h" - -#endif // RE2_UTIL_UTIL_H__ +#endif // UTIL_UTIL_H_ diff --git a/contrib/libre2/util/valgrind.cc b/contrib/libre2/util/valgrind.cc deleted file mode 100644 index 7115c8efdb5..00000000000 --- a/contrib/libre2/util/valgrind.cc +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright 2009 The RE2 Authors. All Rights Reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. 
- -#include "util/util.h" -#include "util/valgrind.h" - -namespace re2 { - -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - -int RunningOnValgrind() { -#if __has_feature(memory_sanitizer) - return true; -#elif defined(RUNNING_ON_VALGRIND) - return RUNNING_ON_VALGRIND; -#else - return 0; -#endif -} - -} // namespace re2 diff --git a/contrib/libre2/util/valgrind.h b/contrib/libre2/util/valgrind.h deleted file mode 100644 index ca10b1a0ddf..00000000000 --- a/contrib/libre2/util/valgrind.h +++ /dev/null @@ -1,4517 +0,0 @@ -/* -*- c -*- - ---------------------------------------------------------------- - - Notice that the following BSD-style license applies to this one - file (valgrind.h) only. The rest of Valgrind is licensed under the - terms of the GNU General Public License, version 2, unless - otherwise indicated. See the COPYING file in the source - distribution for details. - - ---------------------------------------------------------------- - - This file is part of Valgrind, a dynamic binary instrumentation - framework. - - Copyright (C) 2000-2009 Julian Seward. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. The origin of this software must not be misrepresented; you must - not claim that you wrote the original software. If you use this - software in a product, an acknowledgment in the product - documentation would be appreciated but is not required. - - 3. Altered source versions must be plainly marked as such, and must - not be misrepresented as being the original software. - - 4. The name of the author may not be used to endorse or promote - products derived from this software without specific prior written - permission. 
- - THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS - OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE - GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - ---------------------------------------------------------------- - - Notice that the above BSD-style license applies to this one file - (valgrind.h) only. The entire rest of Valgrind is licensed under - the terms of the GNU General Public License, version 2. See the - COPYING file in the source distribution for details. - - ---------------------------------------------------------------- -*/ - - -/* This file is for inclusion into client (your!) code. - - You can use these macros to manipulate and query Valgrind's - execution inside your own programs. - - The resulting executables will still run without Valgrind, just a - little bit more slowly than they otherwise would, but otherwise - unchanged. When not running on valgrind, each client request - consumes very few (eg. 7) instructions, so the resulting performance - loss is negligible unless you plan to execute client requests - millions of times per second. Nevertheless, if that is still a - problem, you can compile with the NVALGRIND symbol defined (gcc - -DNVALGRIND) so that client requests are not even compiled in. */ - -#ifndef __VALGRIND_H -#define __VALGRIND_H - -#include - -/* Nb: this file might be included in a file compiled with -ansi. 
So - we can't use C++ style "//" comments nor the "asm" keyword (instead - use "__asm__"). */ - -/* Derive some tags indicating what the target platform is. Note - that in this file we're using the compiler's CPP symbols for - identifying architectures, which are different to the ones we use - within the rest of Valgrind. Note, __powerpc__ is active for both - 32 and 64-bit PPC, whereas __powerpc64__ is only active for the - latter (on Linux, that is). - - Misc note: how to find out what's predefined in gcc by default: - gcc -Wp,-dM somefile.c -*/ -#undef PLAT_ppc64_aix5 -#undef PLAT_ppc32_aix5 -#undef PLAT_x86_darwin -#undef PLAT_amd64_darwin -#undef PLAT_x86_linux -#undef PLAT_amd64_linux -#undef PLAT_ppc32_linux -#undef PLAT_ppc64_linux -#undef PLAT_arm_linux - -#if defined(_AIX) && defined(__64BIT__) -# define PLAT_ppc64_aix5 1 -#elif defined(_AIX) && !defined(__64BIT__) -# define PLAT_ppc32_aix5 1 -#elif defined(__APPLE__) && defined(__i386__) -# define PLAT_x86_darwin 1 -#elif defined(__APPLE__) && defined(__x86_64__) -# define PLAT_amd64_darwin 1 -#elif defined(__linux__) && defined(__i386__) -# define PLAT_x86_linux 1 -#elif defined(__linux__) && defined(__x86_64__) -# define PLAT_amd64_linux 1 -#elif defined(__linux__) && defined(__powerpc__) && !defined(__powerpc64__) -# define PLAT_ppc32_linux 1 -#elif defined(__linux__) && defined(__powerpc__) && defined(__powerpc64__) -# define PLAT_ppc64_linux 1 -#elif defined(__linux__) && defined(__arm__) -# define PLAT_arm_linux 1 -#else -/* If we're not compiling for our target platform, don't generate - any inline asms. */ -# if !defined(NVALGRIND) -# define NVALGRIND 1 -# endif -#endif - - -/* ------------------------------------------------------------------ */ -/* ARCHITECTURE SPECIFICS for SPECIAL INSTRUCTIONS. There is nothing */ -/* in here of use to end-users -- skip to the next section. 
*/ -/* ------------------------------------------------------------------ */ - -#if defined(NVALGRIND) - -/* Define NVALGRIND to completely remove the Valgrind magic sequence - from the compiled code (analogous to NDEBUG's effects on - assert()) */ -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - { \ - (_zzq_rlval) = (_zzq_default); \ - } - -#else /* ! NVALGRIND */ - -/* The following defines the magic code sequences which the JITter - spots and handles magically. Don't look too closely at them as - they will rot your brain. - - The assembly code sequences for all architectures is in this one - file. This is because this file must be stand-alone, and we don't - want to have multiple files. - - For VALGRIND_DO_CLIENT_REQUEST, we must ensure that the default - value gets put in the return slot, so that everything works when - this is executed not under Valgrind. Args are passed in a memory - block, and so there's no intrinsic limit to the number that could - be passed, but it's currently five. - - The macro args are: - _zzq_rlval result lvalue - _zzq_default default value (result returned when running on real CPU) - _zzq_request request code - _zzq_arg1..5 request params - - The other two macros are used to support function wrapping, and are - a lot simpler. VALGRIND_GET_NR_CONTEXT returns the value of the - guest's NRADDR pseudo-register and whatever other information is - needed to safely run the call original from the wrapper: on - ppc64-linux, the R2 value at the divert point is also needed. This - information is abstracted into a user-visible type, OrigFn. - - VALGRIND_CALL_NOREDIR_* behaves the same as the following on the - guest, but guarantees that the branch instruction will not be - redirected: x86: call *%eax, amd64: call *%rax, ppc32/ppc64: - branch-and-link-to-r11. 
VALGRIND_CALL_NOREDIR is just text, not a - complete inline asm, since it needs to be combined with more magic - inline asm stuff to be useful. -*/ - -/* ------------------------- x86-{linux,darwin} ---------------- */ - -#if defined(PLAT_x86_linux) || defined(PLAT_x86_darwin) - -typedef - struct { - unsigned int nraddr; /* where's the code? */ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "roll $3, %%edi ; roll $13, %%edi\n\t" \ - "roll $29, %%edi ; roll $19, %%edi\n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - { volatile unsigned int _zzq_args[6]; \ - volatile unsigned int _zzq_result; \ - _zzq_args[0] = (unsigned int)(_zzq_request); \ - _zzq_args[1] = (unsigned int)(_zzq_arg1); \ - _zzq_args[2] = (unsigned int)(_zzq_arg2); \ - _zzq_args[3] = (unsigned int)(_zzq_arg3); \ - _zzq_args[4] = (unsigned int)(_zzq_arg4); \ - _zzq_args[5] = (unsigned int)(_zzq_arg5); \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %EDX = client_request ( %EAX ) */ \ - "xchgl %%ebx,%%ebx" \ - : "=d" (_zzq_result) \ - : "a" (&_zzq_args[0]), "0" (_zzq_default) \ - : "cc", "memory" \ - ); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - volatile unsigned int __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %EAX = guest_NRADDR */ \ - "xchgl %%ecx,%%ecx" \ - : "=a" (__addr) \ - : \ - : "cc", "memory" \ - ); \ - _zzq_orig->nraddr = __addr; \ - } - -#define VALGRIND_CALL_NOREDIR_EAX \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* call-noredir *%EAX */ \ - "xchgl %%edx,%%edx\n\t" -#endif /* PLAT_x86_linux || PLAT_x86_darwin */ - -/* ------------------------ amd64-{linux,darwin} --------------- */ - -#if defined(PLAT_amd64_linux) || defined(PLAT_amd64_darwin) - -typedef - struct { - unsigned long long int nraddr; /* where's the code? 
*/ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "rolq $3, %%rdi ; rolq $13, %%rdi\n\t" \ - "rolq $61, %%rdi ; rolq $51, %%rdi\n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - { volatile unsigned long long int _zzq_args[6]; \ - volatile unsigned long long int _zzq_result; \ - _zzq_args[0] = (unsigned long long int)(_zzq_request); \ - _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ - _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ - _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ - _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ - _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %RDX = client_request ( %RAX ) */ \ - "xchgq %%rbx,%%rbx" \ - : "=d" (_zzq_result) \ - : "a" (&_zzq_args[0]), "0" (_zzq_default) \ - : "cc", "memory" \ - ); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - volatile unsigned long long int __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %RAX = guest_NRADDR */ \ - "xchgq %%rcx,%%rcx" \ - : "=a" (__addr) \ - : \ - : "cc", "memory" \ - ); \ - _zzq_orig->nraddr = __addr; \ - } - -#define VALGRIND_CALL_NOREDIR_RAX \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* call-noredir *%RAX */ \ - "xchgq %%rdx,%%rdx\n\t" -#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */ - -/* ------------------------ ppc32-linux ------------------------ */ - -#if defined(PLAT_ppc32_linux) - -typedef - struct { - unsigned int nraddr; /* where's the code? 
*/ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ - "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - \ - { unsigned int _zzq_args[6]; \ - unsigned int _zzq_result; \ - unsigned int* _zzq_ptr; \ - _zzq_args[0] = (unsigned int)(_zzq_request); \ - _zzq_args[1] = (unsigned int)(_zzq_arg1); \ - _zzq_args[2] = (unsigned int)(_zzq_arg2); \ - _zzq_args[3] = (unsigned int)(_zzq_arg3); \ - _zzq_args[4] = (unsigned int)(_zzq_arg4); \ - _zzq_args[5] = (unsigned int)(_zzq_arg5); \ - _zzq_ptr = _zzq_args; \ - __asm__ volatile("mr 3,%1\n\t" /*default*/ \ - "mr 4,%2\n\t" /*ptr*/ \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = client_request ( %R4 ) */ \ - "or 1,1,1\n\t" \ - "mr %0,3" /*result*/ \ - : "=b" (_zzq_result) \ - : "b" (_zzq_default), "b" (_zzq_ptr) \ - : "cc", "memory", "r3", "r4"); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - unsigned int __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR */ \ - "or 2,2,2\n\t" \ - "mr %0,3" \ - : "=b" (__addr) \ - : \ - : "cc", "memory", "r3" \ - ); \ - _zzq_orig->nraddr = __addr; \ - } - -#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* branch-and-link-to-noredir *%R11 */ \ - "or 3,3,3\n\t" -#endif /* PLAT_ppc32_linux */ - -/* ------------------------ ppc64-linux ------------------------ */ - -#if defined(PLAT_ppc64_linux) - -typedef - struct { - unsigned long long int nraddr; /* where's the code? */ - unsigned long long int r2; /* what tocptr do we need? 
*/ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ - "rotldi 0,0,61 ; rotldi 0,0,51\n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - \ - { unsigned long long int _zzq_args[6]; \ - register unsigned long long int _zzq_result __asm__("r3"); \ - register unsigned long long int* _zzq_ptr __asm__("r4"); \ - _zzq_args[0] = (unsigned long long int)(_zzq_request); \ - _zzq_args[1] = (unsigned long long int)(_zzq_arg1); \ - _zzq_args[2] = (unsigned long long int)(_zzq_arg2); \ - _zzq_args[3] = (unsigned long long int)(_zzq_arg3); \ - _zzq_args[4] = (unsigned long long int)(_zzq_arg4); \ - _zzq_args[5] = (unsigned long long int)(_zzq_arg5); \ - _zzq_ptr = _zzq_args; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = client_request ( %R4 ) */ \ - "or 1,1,1" \ - : "=r" (_zzq_result) \ - : "0" (_zzq_default), "r" (_zzq_ptr) \ - : "cc", "memory"); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - register unsigned long long int __addr __asm__("r3"); \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR */ \ - "or 2,2,2" \ - : "=r" (__addr) \ - : \ - : "cc", "memory" \ - ); \ - _zzq_orig->nraddr = __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR_GPR2 */ \ - "or 4,4,4" \ - : "=r" (__addr) \ - : \ - : "cc", "memory" \ - ); \ - _zzq_orig->r2 = __addr; \ - } - -#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* branch-and-link-to-noredir *%R11 */ \ - "or 3,3,3\n\t" - -#endif /* PLAT_ppc64_linux */ - -/* ------------------------- arm-linux ------------------------- */ - -#if defined(PLAT_arm_linux) - -typedef - struct { - unsigned int nraddr; /* where's the code? 
*/ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "mov r12, r12, ror #3 ; mov r12, r12, ror #13 \n\t" \ - "mov r12, r12, ror #29 ; mov r12, r12, ror #19 \n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - \ - { volatile unsigned int _zzq_args[6]; \ - volatile unsigned int _zzq_result; \ - _zzq_args[0] = (unsigned int)(_zzq_request); \ - _zzq_args[1] = (unsigned int)(_zzq_arg1); \ - _zzq_args[2] = (unsigned int)(_zzq_arg2); \ - _zzq_args[3] = (unsigned int)(_zzq_arg3); \ - _zzq_args[4] = (unsigned int)(_zzq_arg4); \ - _zzq_args[5] = (unsigned int)(_zzq_arg5); \ - __asm__ volatile("mov r3, %1\n\t" /*default*/ \ - "mov r4, %2\n\t" /*ptr*/ \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* R3 = client_request ( R4 ) */ \ - "orr r10, r10, r10\n\t" \ - "mov %0, r3" /*result*/ \ - : "=r" (_zzq_result) \ - : "r" (_zzq_default), "r" (&_zzq_args[0]) \ - : "cc","memory", "r3", "r4"); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - unsigned int __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* R3 = guest_NRADDR */ \ - "orr r11, r11, r11\n\t" \ - "mov %0, r3" \ - : "=r" (__addr) \ - : \ - : "cc", "memory", "r3" \ - ); \ - _zzq_orig->nraddr = __addr; \ - } - -#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* branch-and-link-to-noredir *%R4 */ \ - "orr r12, r12, r12\n\t" - -#endif /* PLAT_arm_linux */ - -/* ------------------------ ppc32-aix5 ------------------------- */ - -#if defined(PLAT_ppc32_aix5) - -typedef - struct { - unsigned int nraddr; /* where's the code? */ - unsigned int r2; /* what tocptr do we need? 
*/ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "rlwinm 0,0,3,0,0 ; rlwinm 0,0,13,0,0\n\t" \ - "rlwinm 0,0,29,0,0 ; rlwinm 0,0,19,0,0\n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - \ - { unsigned int _zzq_args[7]; \ - register unsigned int _zzq_result; \ - register unsigned int* _zzq_ptr; \ - _zzq_args[0] = (unsigned int)(_zzq_request); \ - _zzq_args[1] = (unsigned int)(_zzq_arg1); \ - _zzq_args[2] = (unsigned int)(_zzq_arg2); \ - _zzq_args[3] = (unsigned int)(_zzq_arg3); \ - _zzq_args[4] = (unsigned int)(_zzq_arg4); \ - _zzq_args[5] = (unsigned int)(_zzq_arg5); \ - _zzq_args[6] = (unsigned int)(_zzq_default); \ - _zzq_ptr = _zzq_args; \ - __asm__ volatile("mr 4,%1\n\t" \ - "lwz 3, 24(4)\n\t" \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = client_request ( %R4 ) */ \ - "or 1,1,1\n\t" \ - "mr %0,3" \ - : "=b" (_zzq_result) \ - : "b" (_zzq_ptr) \ - : "r3", "r4", "cc", "memory"); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - register unsigned int __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR */ \ - "or 2,2,2\n\t" \ - "mr %0,3" \ - : "=b" (__addr) \ - : \ - : "r3", "cc", "memory" \ - ); \ - _zzq_orig->nraddr = __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR_GPR2 */ \ - "or 4,4,4\n\t" \ - "mr %0,3" \ - : "=b" (__addr) \ - : \ - : "r3", "cc", "memory" \ - ); \ - _zzq_orig->r2 = __addr; \ - } - -#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* branch-and-link-to-noredir *%R11 */ \ - "or 3,3,3\n\t" - -#endif /* PLAT_ppc32_aix5 */ - -/* ------------------------ ppc64-aix5 ------------------------- */ - -#if defined(PLAT_ppc64_aix5) - -typedef - struct { - unsigned long long int nraddr; /* where's the code? 
*/ - unsigned long long int r2; /* what tocptr do we need? */ - } - OrigFn; - -#define __SPECIAL_INSTRUCTION_PREAMBLE \ - "rotldi 0,0,3 ; rotldi 0,0,13\n\t" \ - "rotldi 0,0,61 ; rotldi 0,0,51\n\t" - -#define VALGRIND_DO_CLIENT_REQUEST( \ - _zzq_rlval, _zzq_default, _zzq_request, \ - _zzq_arg1, _zzq_arg2, _zzq_arg3, _zzq_arg4, _zzq_arg5) \ - \ - { unsigned long long int _zzq_args[7]; \ - register unsigned long long int _zzq_result; \ - register unsigned long long int* _zzq_ptr; \ - _zzq_args[0] = (unsigned int long long)(_zzq_request); \ - _zzq_args[1] = (unsigned int long long)(_zzq_arg1); \ - _zzq_args[2] = (unsigned int long long)(_zzq_arg2); \ - _zzq_args[3] = (unsigned int long long)(_zzq_arg3); \ - _zzq_args[4] = (unsigned int long long)(_zzq_arg4); \ - _zzq_args[5] = (unsigned int long long)(_zzq_arg5); \ - _zzq_args[6] = (unsigned int long long)(_zzq_default); \ - _zzq_ptr = _zzq_args; \ - __asm__ volatile("mr 4,%1\n\t" \ - "ld 3, 48(4)\n\t" \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = client_request ( %R4 ) */ \ - "or 1,1,1\n\t" \ - "mr %0,3" \ - : "=b" (_zzq_result) \ - : "b" (_zzq_ptr) \ - : "r3", "r4", "cc", "memory"); \ - _zzq_rlval = _zzq_result; \ - } - -#define VALGRIND_GET_NR_CONTEXT(_zzq_rlval) \ - { volatile OrigFn* _zzq_orig = &(_zzq_rlval); \ - register unsigned long long int __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR */ \ - "or 2,2,2\n\t" \ - "mr %0,3" \ - : "=b" (__addr) \ - : \ - : "r3", "cc", "memory" \ - ); \ - _zzq_orig->nraddr = __addr; \ - __asm__ volatile(__SPECIAL_INSTRUCTION_PREAMBLE \ - /* %R3 = guest_NRADDR_GPR2 */ \ - "or 4,4,4\n\t" \ - "mr %0,3" \ - : "=b" (__addr) \ - : \ - : "r3", "cc", "memory" \ - ); \ - _zzq_orig->r2 = __addr; \ - } - -#define VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - __SPECIAL_INSTRUCTION_PREAMBLE \ - /* branch-and-link-to-noredir *%R11 */ \ - "or 3,3,3\n\t" - -#endif /* PLAT_ppc64_aix5 */ - -/* Insert assembly code for other platforms here... 
*/ - -#endif /* NVALGRIND */ - - -/* ------------------------------------------------------------------ */ -/* PLATFORM SPECIFICS for FUNCTION WRAPPING. This is all very */ -/* ugly. It's the least-worst tradeoff I can think of. */ -/* ------------------------------------------------------------------ */ - -/* This section defines magic (a.k.a appalling-hack) macros for doing - guaranteed-no-redirection macros, so as to get from function - wrappers to the functions they are wrapping. The whole point is to - construct standard call sequences, but to do the call itself with a - special no-redirect call pseudo-instruction that the JIT - understands and handles specially. This section is long and - repetitious, and I can't see a way to make it shorter. - - The naming scheme is as follows: - - CALL_FN_{W,v}_{v,W,WW,WWW,WWWW,5W,6W,7W,etc} - - 'W' stands for "word" and 'v' for "void". Hence there are - different macros for calling arity 0, 1, 2, 3, 4, etc, functions, - and for each, the possibility of returning a word-typed result, or - no result. -*/ - -/* Use these to write the name of your wrapper. NOTE: duplicates - VG_WRAP_FUNCTION_Z{U,Z} in pub_tool_redir.h. */ - -/* Use an extra level of macroisation so as to ensure the soname/fnname - args are fully macro-expanded before pasting them together. */ -#define VG_CONCAT4(_aa,_bb,_cc,_dd) _aa##_bb##_cc##_dd - -#define I_WRAP_SONAME_FNNAME_ZU(soname,fnname) \ - VG_CONCAT4(_vgwZU_,soname,_,fnname) - -#define I_WRAP_SONAME_FNNAME_ZZ(soname,fnname) \ - VG_CONCAT4(_vgwZZ_,soname,_,fnname) - -/* Use this macro from within a wrapper function to collect the - context (address and possibly other info) of the original function. - Once you have that you can then use it in one of the CALL_FN_ - macros. The type of the argument _lval is OrigFn. */ -#define VALGRIND_GET_ORIG_FN(_lval) VALGRIND_GET_NR_CONTEXT(_lval) - -/* Derivatives of the main macros below, for calling functions - returning void. 
*/ - -#define CALL_FN_v_v(fnptr) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_v(_junk,fnptr); } while (0) - -#define CALL_FN_v_W(fnptr, arg1) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_W(_junk,fnptr,arg1); } while (0) - -#define CALL_FN_v_WW(fnptr, arg1,arg2) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_WW(_junk,fnptr,arg1,arg2); } while (0) - -#define CALL_FN_v_WWW(fnptr, arg1,arg2,arg3) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_WWW(_junk,fnptr,arg1,arg2,arg3); } while (0) - -#define CALL_FN_v_WWWW(fnptr, arg1,arg2,arg3,arg4) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_WWWW(_junk,fnptr,arg1,arg2,arg3,arg4); } while (0) - -#define CALL_FN_v_5W(fnptr, arg1,arg2,arg3,arg4,arg5) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_5W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5); } while (0) - -#define CALL_FN_v_6W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_6W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6); } while (0) - -#define CALL_FN_v_7W(fnptr, arg1,arg2,arg3,arg4,arg5,arg6,arg7) \ - do { volatile unsigned long _junk; \ - CALL_FN_W_7W(_junk,fnptr,arg1,arg2,arg3,arg4,arg5,arg6,arg7); } while (0) - -/* ------------------------- x86-{linux,darwin} ---------------- */ - -#if defined(PLAT_x86_linux) || defined(PLAT_x86_darwin) - -/* These regs are trashed by the hidden call. No need to mention eax - as gcc can already see that, plus causes gcc to bomb. */ -#define __CALLER_SAVED_REGS /*"eax"*/ "ecx", "edx" - -/* These CALL_FN_ macros assume that on x86-linux, sizeof(unsigned - long) == 4. 
*/ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[1]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[2]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - __asm__ volatile( \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $4, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - __asm__ volatile( \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $8, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[4]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); 
\ - __asm__ volatile( \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $12, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[5]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - __asm__ volatile( \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $16, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[6]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - __asm__ volatile( \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $20, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - 
-#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[7]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - __asm__ volatile( \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $24, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[8]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - __asm__ volatile( \ - "pushl 28(%%eax)\n\t" \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $28, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { 
\ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[9]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - __asm__ volatile( \ - "pushl 32(%%eax)\n\t" \ - "pushl 28(%%eax)\n\t" \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $32, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[10]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - __asm__ volatile( \ - "pushl 36(%%eax)\n\t" \ - "pushl 32(%%eax)\n\t" \ - "pushl 28(%%eax)\n\t" \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $36, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ 
- : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[11]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - __asm__ volatile( \ - "pushl 40(%%eax)\n\t" \ - "pushl 36(%%eax)\n\t" \ - "pushl 32(%%eax)\n\t" \ - "pushl 28(%%eax)\n\t" \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $40, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ - arg6,arg7,arg8,arg9,arg10, \ - arg11) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[12]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - 
_argvec[11] = (unsigned long)(arg11); \ - __asm__ volatile( \ - "pushl 44(%%eax)\n\t" \ - "pushl 40(%%eax)\n\t" \ - "pushl 36(%%eax)\n\t" \ - "pushl 32(%%eax)\n\t" \ - "pushl 28(%%eax)\n\t" \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $44, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ - arg6,arg7,arg8,arg9,arg10, \ - arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[13]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - _argvec[11] = (unsigned long)(arg11); \ - _argvec[12] = (unsigned long)(arg12); \ - __asm__ volatile( \ - "pushl 48(%%eax)\n\t" \ - "pushl 44(%%eax)\n\t" \ - "pushl 40(%%eax)\n\t" \ - "pushl 36(%%eax)\n\t" \ - "pushl 32(%%eax)\n\t" \ - "pushl 28(%%eax)\n\t" \ - "pushl 24(%%eax)\n\t" \ - "pushl 20(%%eax)\n\t" \ - "pushl 16(%%eax)\n\t" \ - "pushl 12(%%eax)\n\t" \ - "pushl 8(%%eax)\n\t" \ - "pushl 4(%%eax)\n\t" \ - "movl (%%eax), %%eax\n\t" /* target->%eax */ \ - VALGRIND_CALL_NOREDIR_EAX \ - "addl $48, %%esp\n" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#endif /* 
PLAT_x86_linux || PLAT_x86_darwin */ - -/* ------------------------ amd64-{linux,darwin} --------------- */ - -#if defined(PLAT_amd64_linux) || defined(PLAT_amd64_darwin) - -/* ARGREGS: rdi rsi rdx rcx r8 r9 (the rest on stack in R-to-L order) */ - -/* These regs are trashed by the hidden call. */ -#define __CALLER_SAVED_REGS /*"rax",*/ "rcx", "rdx", "rsi", \ - "rdi", "r8", "r9", "r10", "r11" - -/* These CALL_FN_ macros assume that on amd64-linux, sizeof(unsigned - long) == 8. */ - -/* NB 9 Sept 07. There is a nasty kludge here in all these CALL_FN_ - macros. In order not to trash the stack redzone, we need to drop - %rsp by 128 before the hidden call, and restore afterwards. The - nastyness is that it is only by luck that the stack still appears - to be unwindable during the hidden call - since then the behaviour - of any routine using this macro does not match what the CFI data - says. Sigh. - - Why is this important? Imagine that a wrapper has a stack - allocated local, and passes to the hidden call, a pointer to it. - Because gcc does not know about the hidden call, it may allocate - that local in the redzone. Unfortunately the hidden call may then - trash it before it comes to use it. So we must step clear of the - redzone, for the duration of the hidden call, to make it safe. - - Probably the same problem afflicts the other redzone-style ABIs too - (ppc64-linux, ppc32-aix5, ppc64-aix5); but for those, the stack is - self describing (none of this CFI nonsense) so at least messing - with the stack pointer doesn't give a danger of non-unwindable - stack. 
*/ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[1]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[2]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[4]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned 
long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[5]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[6]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 
8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[7]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - "addq $128,%%rsp\n\t" \ - VALGRIND_CALL_NOREDIR_RAX \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[8]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "pushq 56(%%rax)\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - 
"movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $8, %%rsp\n" \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[9]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "pushq 64(%%rax)\n\t" \ - "pushq 56(%%rax)\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $16, %%rsp\n" \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[10]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ 
- _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "pushq 72(%%rax)\n\t" \ - "pushq 64(%%rax)\n\t" \ - "pushq 56(%%rax)\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $24, %%rsp\n" \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[11]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "pushq 80(%%rax)\n\t" \ - "pushq 72(%%rax)\n\t" \ - "pushq 64(%%rax)\n\t" \ - "pushq 56(%%rax)\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $32, %%rsp\n" \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = 
(__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[12]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - _argvec[11] = (unsigned long)(arg11); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "pushq 88(%%rax)\n\t" \ - "pushq 80(%%rax)\n\t" \ - "pushq 72(%%rax)\n\t" \ - "pushq 64(%%rax)\n\t" \ - "pushq 56(%%rax)\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $40, %%rsp\n" \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[13]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned 
long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - _argvec[11] = (unsigned long)(arg11); \ - _argvec[12] = (unsigned long)(arg12); \ - __asm__ volatile( \ - "subq $128,%%rsp\n\t" \ - "pushq 96(%%rax)\n\t" \ - "pushq 88(%%rax)\n\t" \ - "pushq 80(%%rax)\n\t" \ - "pushq 72(%%rax)\n\t" \ - "pushq 64(%%rax)\n\t" \ - "pushq 56(%%rax)\n\t" \ - "movq 48(%%rax), %%r9\n\t" \ - "movq 40(%%rax), %%r8\n\t" \ - "movq 32(%%rax), %%rcx\n\t" \ - "movq 24(%%rax), %%rdx\n\t" \ - "movq 16(%%rax), %%rsi\n\t" \ - "movq 8(%%rax), %%rdi\n\t" \ - "movq (%%rax), %%rax\n\t" /* target->%rax */ \ - VALGRIND_CALL_NOREDIR_RAX \ - "addq $48, %%rsp\n" \ - "addq $128,%%rsp\n\t" \ - : /*out*/ "=a" (_res) \ - : /*in*/ "a" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#endif /* PLAT_amd64_linux || PLAT_amd64_darwin */ - -/* ------------------------ ppc32-linux ------------------------ */ - -#if defined(PLAT_ppc32_linux) - -/* This is useful for finding out about the on-stack stuff: - - extern int f9 ( int,int,int,int,int,int,int,int,int ); - extern int f10 ( int,int,int,int,int,int,int,int,int,int ); - extern int f11 ( int,int,int,int,int,int,int,int,int,int,int ); - extern int f12 ( int,int,int,int,int,int,int,int,int,int,int,int ); - - int g9 ( void ) { - return f9(11,22,33,44,55,66,77,88,99); - } - int g10 ( void ) { - return f10(11,22,33,44,55,66,77,88,99,110); - } - int g11 ( void ) { - return f11(11,22,33,44,55,66,77,88,99,110,121); - } - int g12 ( void ) { - return f12(11,22,33,44,55,66,77,88,99,110,121,132); - } -*/ - -/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ - -/* These regs are trashed by the hidden call. 
*/ -#define __CALLER_SAVED_REGS \ - "lr", "ctr", "xer", \ - "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ - "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ - "r11", "r12", "r13" - -/* These CALL_FN_ macros assume that on ppc32-linux, - sizeof(unsigned long) == 4. */ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[1]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[2]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = 
(__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[4]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[5]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[6]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - __asm__ volatile( 
\ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[7]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[8]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - _argvec[7] = (unsigned long)arg7; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 
6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 9,28(11)\n\t" \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[9]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - _argvec[7] = (unsigned long)arg7; \ - _argvec[8] = (unsigned long)arg8; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 9,28(11)\n\t" \ - "lwz 10,32(11)\n\t" /* arg8->r10 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[10]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - _argvec[7] = (unsigned long)arg7; \ - _argvec[8] = (unsigned 
long)arg8; \ - _argvec[9] = (unsigned long)arg9; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "addi 1,1,-16\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,8(1)\n\t" \ - /* args1-8 */ \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 9,28(11)\n\t" \ - "lwz 10,32(11)\n\t" /* arg8->r10 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "addi 1,1,16\n\t" \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[11]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - _argvec[7] = (unsigned long)arg7; \ - _argvec[8] = (unsigned long)arg8; \ - _argvec[9] = (unsigned long)arg9; \ - _argvec[10] = (unsigned long)arg10; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "addi 1,1,-16\n\t" \ - /* arg10 */ \ - "lwz 3,40(11)\n\t" \ - "stw 3,12(1)\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,8(1)\n\t" \ - /* args1-8 */ \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 9,28(11)\n\t" \ - "lwz 10,32(11)\n\t" /* arg8->r10 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "addi 1,1,16\n\t" \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[12]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - _argvec[7] = (unsigned long)arg7; \ - _argvec[8] = (unsigned long)arg8; \ - _argvec[9] = (unsigned long)arg9; \ - _argvec[10] = (unsigned long)arg10; \ - _argvec[11] = (unsigned long)arg11; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "addi 1,1,-32\n\t" \ - /* arg11 */ \ - "lwz 3,44(11)\n\t" \ - "stw 3,16(1)\n\t" \ - /* arg10 */ \ - "lwz 3,40(11)\n\t" \ - "stw 3,12(1)\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,8(1)\n\t" \ - /* args1-8 */ \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 9,28(11)\n\t" \ - "lwz 10,32(11)\n\t" /* arg8->r10 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "addi 1,1,32\n\t" \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[13]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)arg1; \ - _argvec[2] = (unsigned long)arg2; \ - _argvec[3] = (unsigned long)arg3; \ - _argvec[4] = (unsigned long)arg4; \ - _argvec[5] = (unsigned 
long)arg5; \ - _argvec[6] = (unsigned long)arg6; \ - _argvec[7] = (unsigned long)arg7; \ - _argvec[8] = (unsigned long)arg8; \ - _argvec[9] = (unsigned long)arg9; \ - _argvec[10] = (unsigned long)arg10; \ - _argvec[11] = (unsigned long)arg11; \ - _argvec[12] = (unsigned long)arg12; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "addi 1,1,-32\n\t" \ - /* arg12 */ \ - "lwz 3,48(11)\n\t" \ - "stw 3,20(1)\n\t" \ - /* arg11 */ \ - "lwz 3,44(11)\n\t" \ - "stw 3,16(1)\n\t" \ - /* arg10 */ \ - "lwz 3,40(11)\n\t" \ - "stw 3,12(1)\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,8(1)\n\t" \ - /* args1-8 */ \ - "lwz 3,4(11)\n\t" /* arg1->r3 */ \ - "lwz 4,8(11)\n\t" \ - "lwz 5,12(11)\n\t" \ - "lwz 6,16(11)\n\t" /* arg4->r6 */ \ - "lwz 7,20(11)\n\t" \ - "lwz 8,24(11)\n\t" \ - "lwz 9,28(11)\n\t" \ - "lwz 10,32(11)\n\t" /* arg8->r10 */ \ - "lwz 11,0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "addi 1,1,32\n\t" \ - "mr %0,3" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#endif /* PLAT_ppc32_linux */ - -/* ------------------------ ppc64-linux ------------------------ */ - -#if defined(PLAT_ppc64_linux) - -/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ - -/* These regs are trashed by the hidden call. */ -#define __CALLER_SAVED_REGS \ - "lr", "ctr", "xer", \ - "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ - "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ - "r11", "r12", "r13" - -/* These CALL_FN_ macros assume that on ppc64-linux, sizeof(unsigned - long) == 8. 
*/ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+0]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+1]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+2]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - __asm__ 
volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+3]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+4]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = 
(unsigned long)arg4; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+5]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+6]; \ - volatile unsigned long _res; \ - /* 
_argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+7]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 
*/ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+8]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)" /* restore tocptr */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long 
_argvec[3+9]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "addi 1,1,-128\n\t" /* expand stack frame */ \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - "addi 1,1,128" /* restore frame */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+10]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned 
long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "addi 1,1,-128\n\t" /* expand stack frame */ \ - /* arg10 */ \ - "ld 3,80(11)\n\t" \ - "std 3,120(1)\n\t" \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - "addi 1,1,128" /* restore frame */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+11]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = 
(unsigned long)arg10; \ - _argvec[2+11] = (unsigned long)arg11; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "addi 1,1,-144\n\t" /* expand stack frame */ \ - /* arg11 */ \ - "ld 3,88(11)\n\t" \ - "std 3,128(1)\n\t" \ - /* arg10 */ \ - "ld 3,80(11)\n\t" \ - "std 3,120(1)\n\t" \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - "addi 1,1,144" /* restore frame */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+12]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - _argvec[2+11] = (unsigned long)arg11; \ - _argvec[2+12] = (unsigned long)arg12; \ - __asm__ volatile( \ 
- "mr 11,%1\n\t" \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "addi 1,1,-144\n\t" /* expand stack frame */ \ - /* arg12 */ \ - "ld 3,96(11)\n\t" \ - "std 3,136(1)\n\t" \ - /* arg11 */ \ - "ld 3,88(11)\n\t" \ - "std 3,128(1)\n\t" \ - /* arg10 */ \ - "ld 3,80(11)\n\t" \ - "std 3,120(1)\n\t" \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - "addi 1,1,144" /* restore frame */ \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#endif /* PLAT_ppc64_linux */ - -/* ------------------------- arm-linux ------------------------- */ - -#if defined(PLAT_arm_linux) - -/* These regs are trashed by the hidden call. */ -#define __CALLER_SAVED_REGS "r0", "r1", "r2", "r3","r4","r14" - -/* These CALL_FN_ macros assume that on arm-linux, sizeof(unsigned - long) == 4. 
*/ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[1]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "mov %0, r0\n" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[2]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - __asm__ volatile( \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "mov %0, r0\n" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - __asm__ volatile( \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "mov %0, r0\n" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[4]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - 
_argvec[3] = (unsigned long)(arg3); \ - __asm__ volatile( \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "mov %0, r0\n" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[5]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - __asm__ volatile( \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[6]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - __asm__ volatile( \ - "ldr r0, [%1, #20] \n\t" \ - "push {r0} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #4 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - 
: /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[7]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - __asm__ volatile( \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" \ - "push {r0, r1} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #8 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[8]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - __asm__ volatile( \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" \ - "ldr r2, [%1, #28] \n\t" \ - "push {r0, r1, r2} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #12 \n\t" \ - "mov %0, r0" \ - : 
/*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[9]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - __asm__ volatile( \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" \ - "ldr r2, [%1, #28] \n\t" \ - "ldr r3, [%1, #32] \n\t" \ - "push {r0, r1, r2, r3} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #16 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[10]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - __asm__ volatile( \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" 
\ - "ldr r2, [%1, #28] \n\t" \ - "ldr r3, [%1, #32] \n\t" \ - "ldr r4, [%1, #36] \n\t" \ - "push {r0, r1, r2, r3, r4} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #20 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[11]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - __asm__ volatile( \ - "ldr r0, [%1, #40] \n\t" \ - "push {r0} \n\t" \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" \ - "ldr r2, [%1, #28] \n\t" \ - "ldr r3, [%1, #32] \n\t" \ - "ldr r4, [%1, #36] \n\t" \ - "push {r0, r1, r2, r3, r4} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #24 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ - arg6,arg7,arg8,arg9,arg10, \ - arg11) \ - do { \ - volatile OrigFn _orig = 
(orig); \ - volatile unsigned long _argvec[12]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - _argvec[11] = (unsigned long)(arg11); \ - __asm__ volatile( \ - "ldr r0, [%1, #40] \n\t" \ - "ldr r1, [%1, #44] \n\t" \ - "push {r0, r1} \n\t" \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" \ - "ldr r2, [%1, #28] \n\t" \ - "ldr r3, [%1, #32] \n\t" \ - "ldr r4, [%1, #36] \n\t" \ - "push {r0, r1, r2, r3, r4} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #28 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory",__CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5, \ - arg6,arg7,arg8,arg9,arg10, \ - arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[13]; \ - volatile unsigned long _res; \ - _argvec[0] = (unsigned long)_orig.nraddr; \ - _argvec[1] = (unsigned long)(arg1); \ - _argvec[2] = (unsigned long)(arg2); \ - _argvec[3] = (unsigned long)(arg3); \ - _argvec[4] = (unsigned long)(arg4); \ - _argvec[5] = (unsigned long)(arg5); \ - _argvec[6] = (unsigned long)(arg6); \ - _argvec[7] = (unsigned long)(arg7); \ - _argvec[8] = (unsigned long)(arg8); \ - _argvec[9] = (unsigned long)(arg9); \ - _argvec[10] = (unsigned long)(arg10); \ - _argvec[11] = (unsigned long)(arg11); \ - 
_argvec[12] = (unsigned long)(arg12); \ - __asm__ volatile( \ - "ldr r0, [%1, #40] \n\t" \ - "ldr r1, [%1, #44] \n\t" \ - "ldr r2, [%1, #48] \n\t" \ - "push {r0, r1, r2} \n\t" \ - "ldr r0, [%1, #20] \n\t" \ - "ldr r1, [%1, #24] \n\t" \ - "ldr r2, [%1, #28] \n\t" \ - "ldr r3, [%1, #32] \n\t" \ - "ldr r4, [%1, #36] \n\t" \ - "push {r0, r1, r2, r3, r4} \n\t" \ - "ldr r0, [%1, #4] \n\t" \ - "ldr r1, [%1, #8] \n\t" \ - "ldr r2, [%1, #12] \n\t" \ - "ldr r3, [%1, #16] \n\t" \ - "ldr r4, [%1] \n\t" /* target->r4 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R4 \ - "add sp, sp, #32 \n\t" \ - "mov %0, r0" \ - : /*out*/ "=r" (_res) \ - : /*in*/ "0" (&_argvec[0]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#endif /* PLAT_arm_linux */ - -/* ------------------------ ppc32-aix5 ------------------------- */ - -#if defined(PLAT_ppc32_aix5) - -/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ - -/* These regs are trashed by the hidden call. */ -#define __CALLER_SAVED_REGS \ - "lr", "ctr", "xer", \ - "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ - "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ - "r11", "r12", "r13" - -/* Expand the stack frame, copying enough info that unwinding - still works. Trashes r3. */ - -#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ - "addi 1,1,-" #_n_fr "\n\t" \ - "lwz 3," #_n_fr "(1)\n\t" \ - "stw 3,0(1)\n\t" - -#define VG_CONTRACT_FRAME_BY(_n_fr) \ - "addi 1,1," #_n_fr "\n\t" - -/* These CALL_FN_ macros assume that on ppc32-aix5, sizeof(unsigned - long) == 4. 
*/ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+0]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+1]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+2]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned 
long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+3]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+4]; \ - volatile 
unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+5]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 
11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+6]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+7]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - 
_argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+8]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 
6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ - "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+9]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(64) \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,56(1)\n\t" \ - /* args1-8 */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ - "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 
2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(64) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+10]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(64) \ - /* arg10 */ \ - "lwz 3,40(11)\n\t" \ - "stw 3,60(1)\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,56(1)\n\t" \ - /* args1-8 */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ - "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(64) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", 
__CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+11]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - _argvec[2+11] = (unsigned long)arg11; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(72) \ - /* arg11 */ \ - "lwz 3,44(11)\n\t" \ - "stw 3,64(1)\n\t" \ - /* arg10 */ \ - "lwz 3,40(11)\n\t" \ - "stw 3,60(1)\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,56(1)\n\t" \ - /* args1-8 */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ - "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(72) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) 
- -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+12]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - _argvec[2+11] = (unsigned long)arg11; \ - _argvec[2+12] = (unsigned long)arg12; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "stw 2,-8(11)\n\t" /* save tocptr */ \ - "lwz 2,-4(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(72) \ - /* arg12 */ \ - "lwz 3,48(11)\n\t" \ - "stw 3,68(1)\n\t" \ - /* arg11 */ \ - "lwz 3,44(11)\n\t" \ - "stw 3,64(1)\n\t" \ - /* arg10 */ \ - "lwz 3,40(11)\n\t" \ - "stw 3,60(1)\n\t" \ - /* arg9 */ \ - "lwz 3,36(11)\n\t" \ - "stw 3,56(1)\n\t" \ - /* args1-8 */ \ - "lwz 3, 4(11)\n\t" /* arg1->r3 */ \ - "lwz 4, 8(11)\n\t" /* arg2->r4 */ \ - "lwz 5, 12(11)\n\t" /* arg3->r5 */ \ - "lwz 6, 16(11)\n\t" /* arg4->r6 */ \ - "lwz 7, 20(11)\n\t" /* arg5->r7 */ \ - "lwz 8, 24(11)\n\t" /* arg6->r8 */ \ - "lwz 9, 28(11)\n\t" /* arg7->r9 */ \ - "lwz 10, 32(11)\n\t" /* arg8->r10 */ \ - "lwz 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "lwz 2,-8(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(72) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = 
(__typeof__(lval)) _res; \ - } while (0) - -#endif /* PLAT_ppc32_aix5 */ - -/* ------------------------ ppc64-aix5 ------------------------- */ - -#if defined(PLAT_ppc64_aix5) - -/* ARGREGS: r3 r4 r5 r6 r7 r8 r9 r10 (the rest on stack somewhere) */ - -/* These regs are trashed by the hidden call. */ -#define __CALLER_SAVED_REGS \ - "lr", "ctr", "xer", \ - "cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7", \ - "r0", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", \ - "r11", "r12", "r13" - -/* Expand the stack frame, copying enough info that unwinding - still works. Trashes r3. */ - -#define VG_EXPAND_FRAME_BY_trashes_r3(_n_fr) \ - "addi 1,1,-" #_n_fr "\n\t" \ - "ld 3," #_n_fr "(1)\n\t" \ - "std 3,0(1)\n\t" - -#define VG_CONTRACT_FRAME_BY(_n_fr) \ - "addi 1,1," #_n_fr "\n\t" - -/* These CALL_FN_ macros assume that on ppc64-aix5, sizeof(unsigned - long) == 8. */ - -#define CALL_FN_W_v(lval, orig) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+0]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_W(lval, orig, arg1) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+1]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned 
long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WW(lval, orig, arg1,arg2) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+2]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWW(lval, orig, arg1,arg2,arg3) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+3]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = 
(unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_WWWW(lval, orig, arg1,arg2,arg3,arg4) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+4]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_5W(lval, orig, arg1,arg2,arg3,arg4,arg5) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile 
unsigned long _argvec[3+5]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_6W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+6]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" 
/* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_7W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+7]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_8W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8) \ - do { \ - volatile OrigFn _orig = (orig); 
\ - volatile unsigned long _argvec[3+8]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_9W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+9]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned 
long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(128) \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(128) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_10W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+10]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's 
tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(128) \ - /* arg10 */ \ - "ld 3,80(11)\n\t" \ - "std 3,120(1)\n\t" \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(128) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_11W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+11]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - _argvec[2+11] = (unsigned long)arg11; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(144) \ - /* arg11 */ \ - "ld 3,88(11)\n\t" \ - "std 3,128(1)\n\t" \ - /* arg10 */ \ - "ld 
3,80(11)\n\t" \ - "std 3,120(1)\n\t" \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(144) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#define CALL_FN_W_12W(lval, orig, arg1,arg2,arg3,arg4,arg5,arg6, \ - arg7,arg8,arg9,arg10,arg11,arg12) \ - do { \ - volatile OrigFn _orig = (orig); \ - volatile unsigned long _argvec[3+12]; \ - volatile unsigned long _res; \ - /* _argvec[0] holds current r2 across the call */ \ - _argvec[1] = (unsigned long)_orig.r2; \ - _argvec[2] = (unsigned long)_orig.nraddr; \ - _argvec[2+1] = (unsigned long)arg1; \ - _argvec[2+2] = (unsigned long)arg2; \ - _argvec[2+3] = (unsigned long)arg3; \ - _argvec[2+4] = (unsigned long)arg4; \ - _argvec[2+5] = (unsigned long)arg5; \ - _argvec[2+6] = (unsigned long)arg6; \ - _argvec[2+7] = (unsigned long)arg7; \ - _argvec[2+8] = (unsigned long)arg8; \ - _argvec[2+9] = (unsigned long)arg9; \ - _argvec[2+10] = (unsigned long)arg10; \ - _argvec[2+11] = (unsigned long)arg11; \ - _argvec[2+12] = (unsigned long)arg12; \ - __asm__ volatile( \ - "mr 11,%1\n\t" \ - VG_EXPAND_FRAME_BY_trashes_r3(512) \ - "std 2,-16(11)\n\t" /* save tocptr */ \ - "ld 2,-8(11)\n\t" /* use nraddr's tocptr */ \ - VG_EXPAND_FRAME_BY_trashes_r3(144) \ - /* arg12 */ \ - "ld 3,96(11)\n\t" \ - "std 3,136(1)\n\t" \ - /* arg11 */ \ - "ld 3,88(11)\n\t" \ - "std 3,128(1)\n\t" 
\ - /* arg10 */ \ - "ld 3,80(11)\n\t" \ - "std 3,120(1)\n\t" \ - /* arg9 */ \ - "ld 3,72(11)\n\t" \ - "std 3,112(1)\n\t" \ - /* args1-8 */ \ - "ld 3, 8(11)\n\t" /* arg1->r3 */ \ - "ld 4, 16(11)\n\t" /* arg2->r4 */ \ - "ld 5, 24(11)\n\t" /* arg3->r5 */ \ - "ld 6, 32(11)\n\t" /* arg4->r6 */ \ - "ld 7, 40(11)\n\t" /* arg5->r7 */ \ - "ld 8, 48(11)\n\t" /* arg6->r8 */ \ - "ld 9, 56(11)\n\t" /* arg7->r9 */ \ - "ld 10, 64(11)\n\t" /* arg8->r10 */ \ - "ld 11, 0(11)\n\t" /* target->r11 */ \ - VALGRIND_BRANCH_AND_LINK_TO_NOREDIR_R11 \ - "mr 11,%1\n\t" \ - "mr %0,3\n\t" \ - "ld 2,-16(11)\n\t" /* restore tocptr */ \ - VG_CONTRACT_FRAME_BY(144) \ - VG_CONTRACT_FRAME_BY(512) \ - : /*out*/ "=r" (_res) \ - : /*in*/ "r" (&_argvec[2]) \ - : /*trash*/ "cc", "memory", __CALLER_SAVED_REGS \ - ); \ - lval = (__typeof__(lval)) _res; \ - } while (0) - -#endif /* PLAT_ppc64_aix5 */ - - -/* ------------------------------------------------------------------ */ -/* ARCHITECTURE INDEPENDENT MACROS for CLIENT REQUESTS. */ -/* */ -/* ------------------------------------------------------------------ */ - -/* Some request codes. There are many more of these, but most are not - exposed to end-user view. These are the public ones, all of the - form 0x1000 + small_number. - - Core ones are in the range 0x00000000--0x0000ffff. The non-public - ones start at 0x2000. -*/ - -/* These macros are used by tools -- they must be public, but don't - embed them into other programs. */ -#define VG_USERREQ_TOOL_BASE(a,b) \ - ((unsigned int)(((a)&0xff) << 24 | ((b)&0xff) << 16)) -#define VG_IS_TOOL_USERREQ(a, b, v) \ - (VG_USERREQ_TOOL_BASE(a,b) == ((v) & 0xffff0000)) - -/* !! ABIWARNING !! ABIWARNING !! ABIWARNING !! ABIWARNING !! - This enum comprises an ABI exported by Valgrind to programs - which use client requests. DO NOT CHANGE THE ORDER OF THESE - ENTRIES, NOR DELETE ANY -- add new ones at the end. 
*/ -typedef - enum { VG_USERREQ__RUNNING_ON_VALGRIND = 0x1001, - VG_USERREQ__DISCARD_TRANSLATIONS = 0x1002, - - /* These allow any function to be called from the simulated - CPU but run on the real CPU. Nb: the first arg passed to - the function is always the ThreadId of the running - thread! So CLIENT_CALL0 actually requires a 1 arg - function, etc. */ - VG_USERREQ__CLIENT_CALL0 = 0x1101, - VG_USERREQ__CLIENT_CALL1 = 0x1102, - VG_USERREQ__CLIENT_CALL2 = 0x1103, - VG_USERREQ__CLIENT_CALL3 = 0x1104, - - /* Can be useful in regression testing suites -- eg. can - send Valgrind's output to /dev/null and still count - errors. */ - VG_USERREQ__COUNT_ERRORS = 0x1201, - - /* These are useful and can be interpreted by any tool that - tracks malloc() et al, by using vg_replace_malloc.c. */ - VG_USERREQ__MALLOCLIKE_BLOCK = 0x1301, - VG_USERREQ__FREELIKE_BLOCK = 0x1302, - /* Memory pool support. */ - VG_USERREQ__CREATE_MEMPOOL = 0x1303, - VG_USERREQ__DESTROY_MEMPOOL = 0x1304, - VG_USERREQ__MEMPOOL_ALLOC = 0x1305, - VG_USERREQ__MEMPOOL_FREE = 0x1306, - VG_USERREQ__MEMPOOL_TRIM = 0x1307, - VG_USERREQ__MOVE_MEMPOOL = 0x1308, - VG_USERREQ__MEMPOOL_CHANGE = 0x1309, - VG_USERREQ__MEMPOOL_EXISTS = 0x130a, - - /* Allow printfs to valgrind log. */ - /* The first two pass the va_list argument by value, which - assumes it is the same size as or smaller than a UWord, - which generally isn't the case. Hence are deprecated. - The second two pass the vargs by reference and so are - immune to this problem. */ - /* both :: char* fmt, va_list vargs (DEPRECATED) */ - VG_USERREQ__PRINTF = 0x1401, - VG_USERREQ__PRINTF_BACKTRACE = 0x1402, - /* both :: char* fmt, va_list* vargs */ - VG_USERREQ__PRINTF_VALIST_BY_REF = 0x1403, - VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF = 0x1404, - - /* Stack support. 
*/ - VG_USERREQ__STACK_REGISTER = 0x1501, - VG_USERREQ__STACK_DEREGISTER = 0x1502, - VG_USERREQ__STACK_CHANGE = 0x1503, - - /* Wine support */ - VG_USERREQ__LOAD_PDB_DEBUGINFO = 0x1601 - } Vg_ClientRequest; - -#if !defined(__GNUC__) -# define __extension__ /* */ -#endif - -/* Returns the number of Valgrinds this code is running under. That - is, 0 if running natively, 1 if running under Valgrind, 2 if - running under Valgrind which is running under another Valgrind, - etc. */ -#define RUNNING_ON_VALGRIND __extension__ \ - ({unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0 /* if not */, \ - VG_USERREQ__RUNNING_ON_VALGRIND, \ - 0, 0, 0, 0, 0); \ - _qzz_res; \ - }) - - -/* Discard translation of code in the range [_qzz_addr .. _qzz_addr + - _qzz_len - 1]. Useful if you are debugging a JITter or some such, - since it provides a way to make sure valgrind will retranslate the - invalidated area. Returns no value. */ -#define VALGRIND_DISCARD_TRANSLATIONS(_qzz_addr,_qzz_len) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__DISCARD_TRANSLATIONS, \ - _qzz_addr, _qzz_len, 0, 0, 0); \ - } - - -/* These requests are for getting Valgrind itself to print something. - Possibly with a backtrace. This is a really ugly hack. The return value - is the number of characters printed, excluding the "**** " part at the - start and the backtrace (if present). */ - -#if defined(NVALGRIND) - -# define VALGRIND_PRINTF(...) -# define VALGRIND_PRINTF_BACKTRACE(...) - -#else /* NVALGRIND */ - -/* Modern GCC will optimize the static routine out if unused, - and unused attribute will shut down warnings about it. */ -static int VALGRIND_PRINTF(const char *format, ...) - __attribute__((format(__printf__, 1, 2), __unused__)); -static int -VALGRIND_PRINTF(const char *format, ...) 
-{ - unsigned long _qzz_res; - va_list vargs; - va_start(vargs, format); - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, - VG_USERREQ__PRINTF_VALIST_BY_REF, - (unsigned long)format, - (unsigned long)&vargs, - 0, 0, 0); - va_end(vargs); - return (int)_qzz_res; -} - -static int VALGRIND_PRINTF_BACKTRACE(const char *format, ...) - __attribute__((format(__printf__, 1, 2), __unused__)); -static int -VALGRIND_PRINTF_BACKTRACE(const char *format, ...) -{ - unsigned long _qzz_res; - va_list vargs; - va_start(vargs, format); - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, - VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF, - (unsigned long)format, - (unsigned long)&vargs, - 0, 0, 0); - va_end(vargs); - return (int)_qzz_res; -} - -#endif /* NVALGRIND */ - - -/* These requests allow control to move from the simulated CPU to the - real CPU, calling an arbitary function. - - Note that the current ThreadId is inserted as the first argument. - So this call: - - VALGRIND_NON_SIMD_CALL2(f, arg1, arg2) - - requires f to have this signature: - - Word f(Word tid, Word arg1, Word arg2) - - where "Word" is a word-sized type. - - Note that these client requests are not entirely reliable. For example, - if you call a function with them that subsequently calls printf(), - there's a high chance Valgrind will crash. Generally, your prospects of - these working are made higher if the called function does not refer to - any global variables, and does not refer to any libc or other functions - (printf et al). Any kind of entanglement with libc or dynamic linking is - likely to have a bad outcome, for tricky reasons which we've grappled - with a lot in the past. 
-*/ -#define VALGRIND_NON_SIMD_CALL0(_qyy_fn) \ - __extension__ \ - ({unsigned long _qyy_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ - VG_USERREQ__CLIENT_CALL0, \ - _qyy_fn, \ - 0, 0, 0, 0); \ - _qyy_res; \ - }) - -#define VALGRIND_NON_SIMD_CALL1(_qyy_fn, _qyy_arg1) \ - __extension__ \ - ({unsigned long _qyy_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ - VG_USERREQ__CLIENT_CALL1, \ - _qyy_fn, \ - _qyy_arg1, 0, 0, 0); \ - _qyy_res; \ - }) - -#define VALGRIND_NON_SIMD_CALL2(_qyy_fn, _qyy_arg1, _qyy_arg2) \ - __extension__ \ - ({unsigned long _qyy_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ - VG_USERREQ__CLIENT_CALL2, \ - _qyy_fn, \ - _qyy_arg1, _qyy_arg2, 0, 0); \ - _qyy_res; \ - }) - -#define VALGRIND_NON_SIMD_CALL3(_qyy_fn, _qyy_arg1, _qyy_arg2, _qyy_arg3) \ - __extension__ \ - ({unsigned long _qyy_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ - VG_USERREQ__CLIENT_CALL3, \ - _qyy_fn, \ - _qyy_arg1, _qyy_arg2, \ - _qyy_arg3, 0); \ - _qyy_res; \ - }) - - -/* Counts the number of errors that have been recorded by a tool. Nb: - the tool must record the errors with VG_(maybe_record_error)() or - VG_(unique_error)() for them to be counted. */ -#define VALGRIND_COUNT_ERRORS \ - __extension__ \ - ({unsigned int _qyy_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qyy_res, 0 /* default return */, \ - VG_USERREQ__COUNT_ERRORS, \ - 0, 0, 0, 0, 0); \ - _qyy_res; \ - }) - -/* Several Valgrind tools (Memcheck, Massif, Helgrind, DRD) rely on knowing - when heap blocks are allocated in order to give accurate results. This - happens automatically for the standard allocator functions such as - malloc(), calloc(), realloc(), memalign(), new, new[], free(), delete, - delete[], etc. - - But if your program uses a custom allocator, this doesn't automatically - happen, and Valgrind will not do as well. 
For example, if you allocate - superblocks with mmap() and then allocates chunks of the superblocks, all - Valgrind's observations will be at the mmap() level and it won't know that - the chunks should be considered separate entities. In Memcheck's case, - that means you probably won't get heap block overrun detection (because - there won't be redzones marked as unaddressable) and you definitely won't - get any leak detection. - - The following client requests allow a custom allocator to be annotated so - that it can be handled accurately by Valgrind. - - VALGRIND_MALLOCLIKE_BLOCK marks a region of memory as having been allocated - by a malloc()-like function. For Memcheck (an illustrative case), this - does two things: - - - It records that the block has been allocated. This means any addresses - within the block mentioned in error messages will be - identified as belonging to the block. It also means that if the block - isn't freed it will be detected by the leak checker. - - - It marks the block as being addressable and undefined (if 'is_zeroed' is - not set), or addressable and defined (if 'is_zeroed' is set). This - controls how accesses to the block by the program are handled. - - 'addr' is the start of the usable block (ie. after any - redzone), 'sizeB' is its size. 'rzB' is the redzone size if the allocator - can apply redzones -- these are blocks of padding at the start and end of - each block. Adding redzones is recommended as it makes it much more likely - Valgrind will spot block overruns. `is_zeroed' indicates if the memory is - zeroed (or filled with another predictable value), as is the case for - calloc(). - - VALGRIND_MALLOCLIKE_BLOCK should be put immediately after the point where a - heap block -- that will be used by the client program -- is allocated. 
- It's best to put it at the outermost level of the allocator if possible; - for example, if you have a function my_alloc() which calls - internal_alloc(), and the client request is put inside internal_alloc(), - stack traces relating to the heap block will contain entries for both - my_alloc() and internal_alloc(), which is probably not what you want. - - For Memcheck users: if you use VALGRIND_MALLOCLIKE_BLOCK to carve out - custom blocks from within a heap block, B, that has been allocated with - malloc/calloc/new/etc, then block B will be *ignored* during leak-checking - -- the custom blocks will take precedence. - - VALGRIND_FREELIKE_BLOCK is the partner to VALGRIND_MALLOCLIKE_BLOCK. For - Memcheck, it does two things: - - - It records that the block has been deallocated. This assumes that the - block was annotated as having been allocated via - VALGRIND_MALLOCLIKE_BLOCK. Otherwise, an error will be issued. - - - It marks the block as being unaddressable. - - VALGRIND_FREELIKE_BLOCK should be put immediately after the point where a - heap block is deallocated. - - In many cases, these two client requests will not be enough to get your - allocator working well with Memcheck. More specifically, if your allocator - writes to freed blocks in any way then a VALGRIND_MAKE_MEM_UNDEFINED call - will be necessary to mark the memory as addressable just before the zeroing - occurs, otherwise you'll get a lot of invalid write errors. For example, - you'll need to do this if your allocator recycles freed blocks, but it - zeroes them before handing them back out (via VALGRIND_MALLOCLIKE_BLOCK). - Alternatively, if your allocator reuses freed blocks for allocator-internal - data structures, VALGRIND_MAKE_MEM_UNDEFINED calls will also be necessary. - - Really, what's happening is a blurring of the lines between the client - program and the allocator... 
after VALGRIND_FREELIKE_BLOCK is called, the - memory should be considered unaddressable to the client program, but the - allocator knows more than the rest of the client program and so may be able - to safely access it. Extra client requests are necessary for Valgrind to - understand the distinction between the allocator and the rest of the - program. - - Note: there is currently no VALGRIND_REALLOCLIKE_BLOCK client request; it - has to be emulated with MALLOCLIKE/FREELIKE and memory copying. - - Ignored if addr == 0. -*/ -#define VALGRIND_MALLOCLIKE_BLOCK(addr, sizeB, rzB, is_zeroed) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MALLOCLIKE_BLOCK, \ - addr, sizeB, rzB, is_zeroed, 0); \ - } - -/* See the comment for VALGRIND_MALLOCLIKE_BLOCK for details. - Ignored if addr == 0. -*/ -#define VALGRIND_FREELIKE_BLOCK(addr, rzB) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__FREELIKE_BLOCK, \ - addr, rzB, 0, 0, 0); \ - } - -/* Create a memory pool. */ -#define VALGRIND_CREATE_MEMPOOL(pool, rzB, is_zeroed) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__CREATE_MEMPOOL, \ - pool, rzB, is_zeroed, 0, 0); \ - } - -/* Destroy a memory pool. */ -#define VALGRIND_DESTROY_MEMPOOL(pool) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__DESTROY_MEMPOOL, \ - pool, 0, 0, 0, 0); \ - } - -/* Associate a piece of memory with a memory pool. */ -#define VALGRIND_MEMPOOL_ALLOC(pool, addr, size) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MEMPOOL_ALLOC, \ - pool, addr, size, 0, 0); \ - } - -/* Disassociate a piece of memory from a memory pool. */ -#define VALGRIND_MEMPOOL_FREE(pool, addr) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MEMPOOL_FREE, \ - pool, addr, 0, 0, 0); \ - } - -/* Disassociate any pieces outside a particular range. 
*/ -#define VALGRIND_MEMPOOL_TRIM(pool, addr, size) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MEMPOOL_TRIM, \ - pool, addr, size, 0, 0); \ - } - -/* Resize and/or move a piece associated with a memory pool. */ -#define VALGRIND_MOVE_MEMPOOL(poolA, poolB) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MOVE_MEMPOOL, \ - poolA, poolB, 0, 0, 0); \ - } - -/* Resize and/or move a piece associated with a memory pool. */ -#define VALGRIND_MEMPOOL_CHANGE(pool, addrA, addrB, size) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MEMPOOL_CHANGE, \ - pool, addrA, addrB, size, 0); \ - } - -/* Return 1 if a mempool exists, else 0. */ -#define VALGRIND_MEMPOOL_EXISTS(pool) \ - __extension__ \ - ({unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__MEMPOOL_EXISTS, \ - pool, 0, 0, 0, 0); \ - _qzz_res; \ - }) - -/* Mark a piece of memory as being a stack. Returns a stack id. */ -#define VALGRIND_STACK_REGISTER(start, end) \ - __extension__ \ - ({unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__STACK_REGISTER, \ - start, end, 0, 0, 0); \ - _qzz_res; \ - }) - -/* Unmark the piece of memory associated with a stack id as being a - stack. */ -#define VALGRIND_STACK_DEREGISTER(id) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__STACK_DEREGISTER, \ - id, 0, 0, 0, 0); \ - } - -/* Change the start and end address of the stack id. */ -#define VALGRIND_STACK_CHANGE(id, start, end) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__STACK_CHANGE, \ - id, start, end, 0, 0); \ - } - -/* Load PDB debug info for Wine PE image_map. 
*/ -#define VALGRIND_LOAD_PDB_DEBUGINFO(fd, ptr, total_size, delta) \ - {unsigned int _qzz_res; \ - VALGRIND_DO_CLIENT_REQUEST(_qzz_res, 0, \ - VG_USERREQ__LOAD_PDB_DEBUGINFO, \ - fd, ptr, total_size, delta, 0); \ - } - - -#undef PLAT_x86_linux -#undef PLAT_amd64_linux -#undef PLAT_ppc32_linux -#undef PLAT_ppc64_linux -#undef PLAT_arm_linux -#undef PLAT_ppc32_aix5 -#undef PLAT_ppc64_aix5 - -#endif /* __VALGRIND_H */ diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 38ad210aa76..8f65f1336e8 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -623,7 +623,7 @@ struct ReplaceRegexpImpl { re2_st::StringPiece matches[max_captures]; - int start_pos = 0; + size_t start_pos = 0; while (start_pos < input.length()) { /// If no more replacements possible for current string