Merge remote-tracking branch 'upstream/master'

This commit is contained in:
BayoNet 2018-02-01 18:18:01 +03:00
commit df75399de8
352 changed files with 8761 additions and 30686 deletions

42
.gitlab-ci.yml Normal file
View File

@ -0,0 +1,42 @@
stages:
- build
variables:
GIT_SUBMODULE_STRATEGY: recursive
# NEXUS_USER:
# NEXUS_PASSWORD:
# NEXUS_HOST:
builder:
stage: builder
when: manual
services:
- docker:dind
before_script:
- docker info
- apk add --no-cache git curl binutils ca-certificates
- docker login -u gitlab -p nopasswd $CI_REGISTRY
- docker build -t yandex/clickhouse-builder ./docker/builder
script:
- docker tag yandex/clickhouse-builder $CI_REGISTRY/yandex/clickhouse-builder
- docker push $CI_REGISTRY/yandex/clickhouse-builder
tags:
- docker
build:
stage: build
services:
- docker:dind
before_script:
- cd ./docker/builder
- docker info
- docker build -t yandex/clickhouse-builder .
script:
- docker run --rm --volumes-from "${HOSTNAME}-build" --workdir "${CI_PROJECT_DIR}" --env CI_PROJECT_DIR=${CI_PROJECT_DIR} $CI_REGISTRY/yandex/clickhouse-builder /build_gitlab_ci.sh
# # You can upload your binary to nexus
# after_script:
# - curl -v --keepalive-time 60 --keepalive --user "$NEXUS_USER:$NEXUS_PASSWORD" -XPUT "http://$NEXUS_HOST/repository/binaries/$CI_PROJECT_NAME" --upload-file ./dbms/src/Server/clickhouse
# Or download artifacts from gitlab
artifacts:
paths:
- ./dbms/src/Server/clickhouse
expire_in: 1 day
tags:
- docker

3
.gitmodules vendored
View File

@ -28,3 +28,6 @@
[submodule "contrib/double-conversion"]
path = contrib/double-conversion
url = https://github.com/google/double-conversion.git
[submodule "contrib/re2"]
path = contrib/re2
url = https://github.com/google/re2.git

View File

@ -1,6 +1,7 @@
language: cpp
language: generic
matrix:
fast_finish: true
include:
- os: linux
@ -13,8 +14,9 @@ matrix:
sources:
- ubuntu-toolchain-r-test
packages: [ g++-7, libicu-dev, libreadline-dev, libmysqlclient-dev, unixodbc-dev, libltdl-dev, libssl-dev, libboost-dev, zlib1g-dev, libdouble-conversion-dev, libzookeeper-mt-dev, libsparsehash-dev, librdkafka-dev, libcapnp-dev, libsparsehash-dev, libgoogle-perftools-dev, bash, expect, python, python-lxml, python-termcolor, curl, perl, sudo ]
env:
- MATRIX_EVAL="CC=gcc-7 && CXX=g++-7"
- MATRIX_EVAL="export CC=gcc-7 && export CXX=g++-7"
script:
- utils/travis/normal.sh
@ -35,12 +37,7 @@ matrix:
packages: [ g++-7, clang-5.0, libicu-dev, libreadline-dev, libmysqlclient-dev, unixodbc-dev, libltdl-dev, libssl-dev, libboost-dev, zlib1g-dev, libdouble-conversion-dev, libzookeeper-mt-dev, libsparsehash-dev, librdkafka-dev, libcapnp-dev, libsparsehash-dev, libgoogle-perftools-dev, bash, expect, python, python-lxml, python-termcolor, curl, perl, sudo ]
env:
- MATRIX_EVAL="CC=clang-5.0 && CXX=clang++-5.0"
before_script:
# fix gcc after clang
- export CC=clang-5.0
- export CXX=clang++-5.0
- MATRIX_EVAL="export CC=clang-5.0 && export CXX=clang++-5.0"
script:
- utils/travis/normal.sh
@ -59,8 +56,34 @@ matrix:
apt:
packages: [ pbuilder, fakeroot, debhelper ]
env:
- MATRIX_EVAL="export CC=clang-5.0 && export CXX=clang++-5.0"
script:
- utils/travis/pbuilder.sh
before_install:
# Cant fit to time limit (48min)
# - os: osx
# osx_image: xcode9.2
#
# cache:
# ccache: true
# timeout: 1000
#
# before_install:
# - brew install unixodbc gcc ccache libtool gettext zlib readline double-conversion gperftools google-sparsehash lz4 zstd || true
# - brew link --overwrite gcc || true
#
# env:
# - MATRIX_EVAL="export CC=gcc-7 && export CXX=g++-7"
#
# script:
# - env CMAKE_FLAGS="-DUSE_INTERNAL_BOOST_LIBRARY=1" utils/travis/normal.sh
allow_failures:
- os: osx
before_script:
- eval "${MATRIX_EVAL}"

View File

@ -1,109 +1,122 @@
# ClickHouse release 1.1.54342, 2018-01-22
This release contains bug fixes for the previous release 1.1.54342:
* Fixed a regression in 1.1.54337: if the default user have readonly access, then the server refuse to start up with messages `Cannot create database in readonly mode`.
* Fixed a regression in 1.1.54337: on the systems with `systemd` logs are always written to syslog regardless to the configuration; watchdog script still uses `init.d`.
* Fixed a regression in 1.1.54337: wrong default configuration in the Docker image.
* Fixed nondeterministic behaviour of GraphiteMergeTree (you can notice it in log messages `Data after merge is not byte-identical to data on another replicas`).
* Fixed a bug that may lead to inconsistent merges after OPTIMIZE query to Replicated tables (you may notice it in log messages `Part ... intersects previous part`).
* Buffer tables now work file in presense of MATERIALIZED columns in destination table (by zhang2014).
* Fixed a bug in implementation of NULL.
# ClickHouse release 1.1.54337, 2018-01-18
## New features:
* Added support for storage of multidimensional arrays and tuples (`Tuple` data type) in tables.
* Added support for usage of table functions in `DESCRIBE` and `INSERT` queries. Added supoort for subqueries in `DESCRIBE`. Examples: `DESC TABLE remote('host', default.hits)`; `DESC TABLE (SELECT 1)`; `INSERT INTO TABLE FUNCTION remote('host', default.hits)`. Support for `INSERT INTO TABLE` syntax in addition to `INSERT INTO`.
* Improvement of timezones support. `DateTime` data type can be annotated with timezone, that is used for parsing and formatting in text formats. Example: `DateTime('Europe/Moscow')`. When specifying timezones in functions for DateTime arguments, the return type will track timezone, and the value will be displayed as expected.
* Added functions `toTimeZone`, `timeDiff`, `toQuarter`, `toRelativeQuarterNum`. Functions `toRelativeHour`/`Minute`/`Second` can take a value of type `Date` as an argument. The name of function `now` has been made case insensitive.
* Added function `toStartOfFifteenMinutes` (Kirill Shvakov).
* Added tool `clickhouse format` for formatting queries.
* Added configuration parameter `format_schema_path` (Marek Vavruša). It is used for specifying schema of `Cap'n'Proto` format. Schema files can be located only in specified directory.
* Added support for table functions in `DESCRIBE` and `INSERT` queries. Added support for subqueries in `DESCRIBE`. Examples: `DESC TABLE remote('host', default.hits)`; `DESC TABLE (SELECT 1)`; `INSERT INTO TABLE FUNCTION remote('host', default.hits)`. Support for `INSERT INTO TABLE` syntax in addition to `INSERT INTO`.
* Improved support for timezones. The `DateTime` data type can be annotated with the timezone that is used for parsing and formatting in text formats. Example: `DateTime('Europe/Moscow')`. When timezones are specified in functions for DateTime arguments, the return type will track the timezone, and the value will be displayed as expected.
* Added the functions `toTimeZone`, `timeDiff`, `toQuarter`, `toRelativeQuarterNum`. The `toRelativeHour`/`Minute`/`Second` functions can take a value of type `Date` as an argument. The name of the `now` function has been made case-insensitive.
* Added the `toStartOfFifteenMinutes` function (Kirill Shvakov).
* Added the `clickhouse format` tool for formatting queries.
* Added the `format_schema_path` configuration parameter (Marek Vavruša). It is used for specifying a schema in `Cap'n'Proto` format. Schema files can be located only in the specified directory.
* Added support for config substitutions (`incl` and `conf.d`) for configuration of external dictionaries and models (Pavel Yakunin).
* Added column with documentation for table `system.settings` (Kirill Shvakov).
* Added table `system.parts_columns` with information about column sizes in each data part of `MergeTree` tables.
* Added table `system.models` with information about loaded `CatBoost` machine learning models.
* Added table functions `mysql` and `odbc` along with corresponding table engines `MySQL`, `ODBC` to work with foreign databases. This feature is in beta stage.
* Added possibility to pass an argument of type `AggregateFunction` for `groupArray` aggregate function (so you can create an array of states of some aggregate function).
* Removed restrictions on various combinations of aggregate function combinators. For example, you can use `avgForEachIf` as well as `avgIfForEach` aggregate functions, that have different behaviour.
* Aggregate function combinator `-ForEach` is extended for the case of aggregate functions of multiple arguments.
* Added support for aggregate functions of `Nullable` arguments even for cases when the function returns non `Nullable` result (added with contribution of Silviu Caragea). Examples: `groupArray`, `groupUniqArray`, `topK`.
* Added command line parameter `max_client_network_bandwidth` for `clickhouse-client` (Kirill Shvakov).
* Allowed to work with TEMPORARY tables (CREATE, DROP, INSERT...) for users with `readonly = 2` setting (Kirill Shvakov).
* Added support for using multiple consumers with `Kafka` engine. Extended configuration options for `Kafka` (Marek Vavruša).
* Added functions `intExp2`, `intExp10`.
* Added aggregate function `sumKahan` (computationally stable summation of floating point numbers).
* Added functions to*Number*OrNull, where *Number* is a numeric type.
* Added support for `WITH` clause for `INSERT SELECT` query (by zhang2014).
* Added settings `http_connection_timeout`, `http_send_timeout`, `http_receive_timeout`. In particular, these settings are used for downloading data parts for replication. Changing these settings allows to have more quick failover in the case of overloaded network.
* Added support for `ALTER` query for tables of type `Null` (Anastasiya Tsarkova). Tables of type `Null` are often used with materialized views.
* `reinterpretAsString` function is extended for all data types that are stored contiguously in memory.
* Added option `--silent` for `clickhouse-local` tool. It suppresses printing query execution info in stderr.
* Added support for reading values of type `Date` from text in a format where month and/or day of month is specified in single digit instead of two digits (Amos Bird).
* Added a column with documentation for the `system.settings` table (Kirill Shvakov).
* Added the `system.parts_columns` table with information about column sizes in each data part of `MergeTree` tables.
* Added the `system.models` table with information about loaded `CatBoost` machine learning models.
* Added the `mysql` and `odbc` table functions along with the corresponding `MySQL` and `ODBC` table engines for working with foreign databases. This feature is in the beta stage.
* Added the possibility to pass an argument of type `AggregateFunction` for the `groupArray` aggregate function (so you can create an array of states of some aggregate function).
* Removed restrictions on various combinations of aggregate function combinators. For example, you can use `avgForEachIf` as well as `avgIfForEach` aggregate functions, which have different behaviors.
* The `-ForEach` aggregate function combinator is extended for the case of aggregate functions of multiple arguments.
* Added support for aggregate functions of `Nullable` arguments even for cases when the function returns a non-`Nullable` result (added with the contribution of Silviu Caragea). Examples: `groupArray`, `groupUniqArray`, `topK`.
* Added the `max_client_network_bandwidth` command line parameter for `clickhouse-client` (Kirill Shvakov).
* Users with the `readonly = 2` setting are allowed to work with TEMPORARY tables (CREATE, DROP, INSERT...) (Kirill Shvakov).
* Added support for using multiple consumers with the `Kafka` engine. Extended configuration options for `Kafka` (Marek Vavruša).
* Added the `intExp2` and `intExp10` functions.
* Added the `sumKahan` aggregate function (computationally stable summation of floating point numbers).
* Added to*Number*OrNull functions, where *Number* is a numeric type.
* Added support for the `WITH` clause for an `INSERT SELECT` query (by zhang2014).
* Added the settings `http_connection_timeout`, `http_send_timeout`, and `http_receive_timeout`. In particular, these settings are used for downloading data parts for replication. Changing these settings allows for faster failover if the network is overloaded.
* Added support for the `ALTER` query for tables of type `Null` (Anastasiya Tsarkova). Tables of type `Null` are often used with materialized views.
* The `reinterpretAsString` function is extended for all data types that are stored contiguously in memory.
* Added the `--silent` option for the `clickhouse-local` tool. It suppresses printing query execution info in stderr.
* Added support for reading values of type `Date` from text in a format where the month and/or day of the month is specified using a single digit instead of two digits (Amos Bird).
## Performance optimizations:
* Improved performance of `min`, `max`, `any`, `anyLast`, `anyHeavy`, `argMin`, `argMax` aggregate functions in case of String argument.
* Improved performance of `min`, `max`, `any`, `anyLast`, `anyHeavy`, `argMin`, `argMax` aggregate functions for String arguments.
* Improved performance of `isInfinite`, `isFinite`, `isNaN`, `roundToExp2` functions.
* Improved performance of parsing and formatting values of type `Date` and `DateTime` in text formats.
* Improved performance and precision of parsing floating point numbers.
* Lowered memory usage for `JOIN` in the case when left and right hands have columns with identical names that are not containing in `USING`.
* Improved performance of `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr` aggregate functions in cost of reducing computational stability. Old functions are available under names: `varSampStable`, `varPopStable`, `stddevSampStable`, `stddevPopStable`, `covarSampStable`, `covarPopStable`, `corrStable`.
* Lowered memory usage for `JOIN` in the case when the left and right parts have columns with identical names that are not contained in `USING`.
* Improved performance of `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, and `corr` aggregate functions by reducing computational stability. The old functions are available under the names: `varSampStable`, `varPopStable`, `stddevSampStable`, `stddevPopStable`, `covarSampStable`, `covarPopStable`, `corrStable`.
## Bug fixes:
* Fixed data deduplication after running `DROP PARTITION` query. In previous version dropping a partition and INSERTing the same data again was not working because INSERTed blocks was considered duplicate.
* Fixed bug that may lead to incorrect interpretation of `WHERE` clause for queries `CREATE MATERIALIZED VIEW` with `POPULATE`.
* Fixed bug in using `root_path` parameter in configuration of `zookeeper_servers`.
* Fixed unexpected result of passing `Date` argument to `toStartOfDay`.
* Fixed function `addMonths`, `subtractMonths` and `INTERVAL n MONTH` arithmetic in the case when the result have year less than argument.
* Added missing support of `UUID` data type for `DISTINCT`, `JOIN`, `uniq` aggregate functions and external dictionaries (Ivanov Evgeniy). Support for `UUID` is still incomplete.
* Fixed `SummingMergeTree` behaviour in cases of rows summed to zero.
* Various fixes for `Kafka` engine (Marek Vavruša).
* Fixed incorrect behaviour of `Join` table engine (Amos Bird).
* Fixed wrong behaviour of allocator under FreeBSD and OS X.
* `extractAll` function now support empty matches.
* Fixed error that blocks usage of `libressl` instead of `openssl`.
* Fixed query `CREATE TABLE AS SELECT` from temporary tables.
* Fixed non-atomicity of updating replication queue. This could lead to desync of replicas until server restart.
* Fixed data deduplication after running a `DROP PARTITION` query. In the previous version, dropping a partition and INSERTing the same data again was not working because INSERTed blocks were considered duplicates.
* Fixed a bug that could lead to incorrect interpretation of the `WHERE` clause for `CREATE MATERIALIZED VIEW` queries with `POPULATE`.
* Fixed a bug in using the `root_path` parameter in the `zookeeper_servers` configuration.
* Fixed unexpected results of passing the `Date` argument to `toStartOfDay`.
* Fixed the `addMonths` and `subtractMonths` functions and the arithmetic for `INTERVAL n MONTH` in cases when the result has the previous year.
* Added missing support for the `UUID` data type for `DISTINCT`, `JOIN`, and `uniq` aggregate functions and external dictionaries (Evgeniy Ivanov). Support for `UUID` is still incomplete.
* Fixed `SummingMergeTree` behavior in cases when the rows summed to zero.
* Various fixes for the `Kafka` engine (Marek Vavruša).
* Fixed incorrect behavior of the `Join` table engine (Amos Bird).
* Fixed incorrect allocator behavior under FreeBSD and OS X.
* The `extractAll` function now supports empty matches.
* Fixed an error that blocked usage of `libressl` instead of `openssl`.
* Fixed the `CREATE TABLE AS SELECT` query from temporary tables.
* Fixed non-atomicity of updating the replication queue. This could lead to replicas being out of sync until the server restarts.
* Fixed possible overflow in `gcd`, `lcm` and `modulo` (`%` operator) (Maks Skorokhod).
* `-preprocessed` files now created after changing of `umask` (`umask` can be changed in config).
* Fixed bug in background check of parts (`MergeTreePartChecker`) in the case of using custom partition key.
* Fixed parsing of tuples (values of `Tuple` data type) in text formats.
* `-preprocessed` files are now created after changing `umask` (`umask` can be changed in the config).
* Fixed a bug in the background check of parts (`MergeTreePartChecker`) when using a custom partition key.
* Fixed parsing of tuples (values of the `Tuple` data type) in text formats.
* Improved error messages about incompatible types passed to `multiIf`, `array` and some other functions.
* Support for `Nullable` types is completely reworked. Fixed bugs, that may lead to server crash. Fixed almost all other bugs related to NULL support: incorrect type conversions in INSERT SELECT, unsufficient support for Nullable in HAVING and PREWHERE, `join_use_nulls` mode, Nullable types as arguments of OR operator, etc.
* Fixed various bugs related to internal semantics of data types. Examples: unnecessary summing fields of `Enum` type in `SummingMergeTree`; alignment of `Enum` types in Pretty formats, etc.
* More strict checks for allowed combinations of composite columns - fixed several bugs, that could lead to server crash.
* Fixed overflow when specifying very large parameter for `FixedString` data type.
* Fixed bug in aggregate function `topK` in generic case.
* Added missing check for equality of array sizes in arguments of n-ary variants of aggregate functions with `-Array` combinator.
* Fixed `--pager` option for `clickhouse-client` (by ks1322).
* Fixed precision of `exp10` function.
* Fixed behavior of `visitParamExtract` function for better compliance with documentation.
* Fixed crash when specifying incorrect data types.
* Fixed behavior of `DISTINCT` in the case when all columns are constants.
* Fixed query formatting in the case of using `tupleElement` function with complex constant expression as tuple element index.
* Fixed `Dictionary` table engine for dictionaries of type `range_hashed`.
* Fixed bug that leads to excessive rows in the result of `FULL` and `RIGHT JOIN` (Amos Bird).
* Fixed server crash when creating and removing temporary files in `config.d` directories during config reload.
* Fixed `SYSTEM DROP DNS CACHE` query: the cache was flushed but addresses of cluster nodes was not updated.
* Fixed behavior of `MATERIALIZED VIEW` after executing `DETACH TABLE` for the table under the view (Marek Vavruša).
* Support for `Nullable` types is completely reworked. Fixed bugs that may lead to a server crash. Fixed almost all other bugs related to NULL support: incorrect type conversions in INSERT SELECT, insufficient support for Nullable in HAVING and PREWHERE, `join_use_nulls` mode, Nullable types as arguments of OR operator, etc.
* Fixed various bugs related to internal semantics of data types. Examples: unnecessary summing of `Enum` type fields in `SummingMergeTree`; alignment of `Enum` types in Pretty formats, etc.
* Stricter checks for allowed combinations of composite columns. Fixed several bugs that could lead to a server crash.
* Fixed the overflow when specifying a very large parameter for the `FixedString` data type.
* Fixed a bug in the `topK` aggregate function in a generic case.
* Added the missing check for equality of array sizes in arguments of n-ary variants of aggregate functions with an `-Array` combinator.
* Fixed the `--pager` option for `clickhouse-client` (by ks1322).
* Fixed the precision of the `exp10` function.
* Fixed the behavior of the `visitParamExtract` function for better compliance with documentation.
* Fixed the crash when incorrect data types are specified.
* Fixed the behavior of `DISTINCT` in the case when all columns are constants.
* Fixed query formatting in the case of using the `tupleElement` function with a complex constant expression as the tuple element index.
* Fixed the `Dictionary` table engine for dictionaries of type `range_hashed`.
* Fixed a bug that leads to excessive rows in the result of `FULL` and `RIGHT JOIN` (Amos Bird).
* Fixed a server crash when creating and removing temporary files in `config.d` directories during config reload.
* Fixed the `SYSTEM DROP DNS CACHE` query: the cache was flushed but addresses of cluster nodes were not updated.
* Fixed the behavior of `MATERIALIZED VIEW` after executing `DETACH TABLE` for the table under the view (Marek Vavruša).
## Build improvements:
* Using `pbuilder` for builds. Build process is almost independent on the environment of build host.
* Single build is used for different OS versions. Packages and binaries have been made compatible with wide range of Linux systems.
* Added `clickhouse-test` package. It can be used to run functional tests.
* Added publishing of source tarball to the repository. It can be used to reproduce build without using GitHub.
* Added limited integration with Travis CI. Due to limits on build time in Travis, only debug build is tested and limited subset of tests are run.
* Added support for `Cap'n'Proto` in default build.
* Changed documentation sources format from `Restructured Text` to `Markdown`.
* Builds use `pbuilder`. The build process is almost completely independent of the build host environment.
* A single build is used for different OS versions. Packages and binaries have been made compatible with a wide range of Linux systems.
* Added the `clickhouse-test` package. It can be used to run functional tests.
* The source tarball can now be published to the repository. It can be used to reproduce the build without using GitHub.
* Added limited integration with Travis CI. Due to limits on build time in Travis, only the debug build is tested and a limited subset of tests are run.
* Added support for `Cap'n'Proto` in the default build.
* Changed the format of documentation sources from `Restructured Text` to `Markdown`.
* Added support for `systemd` (Vladimir Smirnov). It is disabled by default due to incompatibility with some OS images and can be enabled manually.
* For dynamic code generation, `clang` and `lld` are embedded into the `clickhouse` binary. They can also be invoked as `clickhouse clang` and `clickhouse lld`.
* Removed usage of GNU extensions from the code. Enabled `-Wextra` option. When building with `clang`, `libc++` is used instead of `libstdc++`.
* Removed usage of GNU extensions from the code. Enabled the `-Wextra` option. When building with `clang`, `libc++` is used instead of `libstdc++`.
* Extracted `clickhouse_parsers` and `clickhouse_common_io` libraries to speed up builds of various tools.
## Backward incompatible changes:
* Marks format for tables of type `Log`, that contain `Nullable` columns, was changed in backward incompatible way. If you have these tables, you should convert them to `TinyLog` type before staring up new server version. To do that, you should replace `ENGINE = Log` to `ENGINE = TinyLog` in corresponding `.sql` file in `metadata` directory. If your table don't have `Nullable` columns or if type of your table is not `Log`, then no actions required.
* Removed setting `experimental_allow_extended_storage_definition_syntax`. Now this feature is enabled by default.
* To avoid confusion, `runningIncome` function is renamed to `runningDifferenceStartingWithFirstValue`.
* Removed `FROM ARRAY JOIN arr` syntax: when ARRAY JOIN is specified directly after FROM with no table (Amos Bird).
* Removed `BlockTabSeparated` format that was used solely for demonstration purposes.
* Changed serialization format of intermediate states of `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr` aggregate functions. If you have stored states of these aggregate functions in tables (using AggregateFunction data type or materialized views with corresponding States), please write a message to clickhouse-feedback@yandex-team.com.
* The format for marks in `Log` type tables that contain `Nullable` columns was changed in a backward incompatible way. If you have these tables, you should convert them to the `TinyLog` type before starting up the new server version. To do this, replace `ENGINE = Log` with `ENGINE = TinyLog` in the corresponding `.sql` file in the `metadata` directory. If your table doesn't have `Nullable` columns or if the type of your table is not `Log`, then you don't need to do anything.
* Removed the `experimental_allow_extended_storage_definition_syntax` setting. Now this feature is enabled by default.
* To avoid confusion, the `runningIncome` function has been renamed to `runningDifferenceStartingWithFirstValue`.
* Removed the `FROM ARRAY JOIN arr` syntax when ARRAY JOIN is specified directly after FROM with no table (Amos Bird).
* Removed the `BlockTabSeparated` format that was used solely for demonstration purposes.
* Changed the serialization format of intermediate states of the aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, and `corr`. If you have stored states of these aggregate functions in tables (using the AggregateFunction data type or materialized views with corresponding states), please write to clickhouse-feedback@yandex-team.com.
* In previous server versions there was an undocumented feature: if an aggregate function depend on parameters, you still may specify it without parameters in AggregateFunction data type. Example: `AggregateFunction(quantiles, UInt64)` instead of `AggregateFunction(quantiles(0.5, 0.9), UInt64)`. This feature is lost. Despite it was undocumented feature, we plan to support it again in next releases.
## Please note when upgrading:
* When doing rolling update on cluster, when some of replicas run old version of ClickHouse and some of replicas run new version, replication is temporarily stopped and messages `unknown parameter 'shard'` will appear in log. Replication will continue after updating all replicas of the cluster.
* If you have different ClickHouse versions on the cluster, you can get wrong results for distributed queries with aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. You should update all cluster nodes.
* When doing a rolling update on a cluster, at the point when some of the replicas are running the old version of ClickHouse and some are running the new version, replication is temporarily stopped and the message `unknown parameter 'shard'` appears in the log. Replication will continue after all replicas of the cluster are updated.
* If you have different ClickHouse versions on the cluster, you can get incorrect results for distributed queries with the aggregate functions `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, and `corr`. You should update all cluster nodes.
# ClickHouse release 1.1.54327, 2017-12-21

View File

@ -1,6 +1,18 @@
# Релиз ClickHouse 1.1.54342, 2018-01-22
Релиз содержит исправление к предыдущему релизу 1.1.54318:
* Исправлена регрессия в версии 1.1.54337: если пользователь по-умолчанию имеет readonly доступ, то сервер отказывался стартовать с сообщением `Cannot create database in readonly mode`.
* Исправлена регрессия в версии 1.1.54337: на системах под управлением systemd, логи по ошибке всегда записываются в syslog; watchdog скрипт по ошибке использует init.d.
* Исправлена регрессия в версии 1.1.54337: неправильная конфигурация по-умоланию в Docker образе.
* Исправлена недетерминированная работа GraphiteMergeTree (в логах видно по сообщениям `Data after merge is not byte-identical to data on another replicas`).
* Исправлена ошибка, в связи с которой запрос OPTIMIZE к Replicated таблицам мог приводить к неконсистентным мержам (в логах видно по сообщениям `Part ... intersects previous part`).
* Таблицы типа Buffer теперь работают при наличии MATERIALIZED столбцов в таблице назначения (by zhang2014).
* Исправлена одна из ошибок в реализации NULL.
# Релиз ClickHouse 1.1.54337, 2018-01-18
## Новые возможности:
* Добавлена поддержка хранения многомерных массивов и кортежей (тип данных `Tuple`) в таблицах.
* Поддержка табличных функций для запросов `DESCRIBE` и `INSERT`. Поддержка подзапроса в запросе `DESCRIBE`. Примеры: `DESC TABLE remote('host', default.hits)`; `DESC TABLE (SELECT 1)`; `INSERT INTO TABLE FUNCTION remote('host', default.hits)`. Возможность писать `INSERT INTO TABLE` вместо `INSERT INTO`.
* Улучшена поддержка часовых поясов. В типе `DateTime` может быть указана таймзона, которая используется для парсинга и отображения данных в текстовом виде. Пример: `DateTime('Europe/Moscow')`. При указании таймзоны в функциях работы с `DateTime`, тип возвращаемого значения будет запоминать таймзону, для того, чтобы значение отображалось ожидаемым образом.
@ -100,6 +112,7 @@
* Удалена возможность написания `FROM ARRAY JOIN arr` без указания таблицы после FROM (Amos Bird).
* Удалён формат `BlockTabSeparated`, использовавшийся лишь для демонстрационных целей.
* Изменён формат состояния агрегатных функций `varSamp`, `varPop`, `stddevSamp`, `stddevPop`, `covarSamp`, `covarPop`, `corr`. Если вы использовали эти состояния для хранения в таблицах (тип данных `AggregateFunction` от этих функций или материализованные представления, хранящие эти состояния), напишите на clickhouse-feedback@yandex-team.com.
* В предыдущих версиях существовала недокументированная возможность: в типе данных AggregateFunction можно было не указывать параметры для агрегатной функции, которая зависит от параметров. Пример: `AggregateFunction(quantiles, UInt64)` вместо `AggregateFunction(quantiles(0.5, 0.9), UInt64)`. Эта возможность потеряна. Не смотря на то, что возможность не документирована, мы собираемся вернуть её в ближайших релизах.
## На что обратить внимание при обновлении:
* При обновлении кластера, на время, когда на одних репликах работает новая версия сервера, а на других - старая, репликация будет приостановлена и в логе появятся сообщения вида `unknown parameter 'shard'`. Репликация продолжится после обновления всех реплик кластера.

View File

@ -17,9 +17,6 @@ else ()
message (WARNING "You are using an unsupported compiler! Compilation has only been tested with Clang 5+ and GCC 7+.")
endif ()
if (CMAKE_SYSTEM MATCHES "FreeBSD")
set (PLATFORM_EXTRA_CXX_FLAG "-DCLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST")
endif ()
# Write compile_commands.json
set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
@ -51,6 +48,16 @@ if ( ( ARCH_ARM AND NOT ARCH_AARCH64 ) OR ARCH_I386)
set (ARCH_32 1)
message (WARNING "Support for 32bit platforms is highly experimental")
endif ()
if (CMAKE_SYSTEM MATCHES "Linux")
set (ARCH_LINUX 1)
endif ()
if (CMAKE_SYSTEM MATCHES "FreeBSD")
set (ARCH_FREEBSD 1)
endif ()
if (ARCH_FREEBSD)
set (PLATFORM_EXTRA_CXX_FLAG "-DCLOCK_MONOTONIC_COARSE=CLOCK_MONOTONIC_FAST")
endif ()
set (COMMON_WARNING_FLAGS "${COMMON_WARNING_FLAGS} -Wall") # -Werror is also added inside directories with our own code.
set (CXX_WARNING_FLAGS "${CXX_WARNING_FLAGS} -Wnon-virtual-dtor")
@ -60,7 +67,10 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set (COMMON_WARNING_FLAGS "${COMMON_WARNING_FLAGS} -Wno-unused-command-line-argument")
endif ()
set (CXX11_ABI "ENABLE" CACHE STRING "Use C++11 ABI: DEFAULT, ENABLE, DISABLE")
if (ARCH_LINUX)
set (CXX11_ABI "ENABLE" CACHE STRING "Use C++11 ABI: DEFAULT, ENABLE, DISABLE")
endif ()
option (TEST_COVERAGE "Enables flags for test coverage" OFF)
option (ENABLE_TESTS "Enables tests" ON)
@ -82,7 +92,7 @@ endif ()
if (CMAKE_LIBRARY_ARCHITECTURE MATCHES "amd64.*|x86_64.*|AMD64.*")
option (USE_INTERNAL_MEMCPY "Use internal implementation of 'memcpy' function instead of provided by libc. Only for x86_64." ON)
if (CMAKE_SYSTEM MATCHES "Linux")
if (ARCH_LINUX)
option (GLIBC_COMPATIBILITY "Set to TRUE to enable compatibility with older glibc libraries. Only for x86_64, Linux. Implies USE_INTERNAL_MEMCPY." ON)
endif()
endif ()
@ -154,7 +164,7 @@ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} -fn
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g3 -ggdb3 -fno-inline ${CMAKE_C_FLAGS_ADD}")
if (MAKE_STATIC_LIBRARIES AND NOT APPLE AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND CMAKE_SYSTEM MATCHES "FreeBSD"))
if (MAKE_STATIC_LIBRARIES AND NOT APPLE AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND ARCH_FREEBSD))
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")
endif ()
@ -162,7 +172,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
include (cmake/test_compiler.cmake)
if (CMAKE_SYSTEM MATCHES "Linux" AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
if (ARCH_LINUX AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${GLIBC_COMPATIBILITY_LINK_FLAGS} ${CXX11_ABI_FLAGS}")
option (USE_LIBCXX "Use libc++ and libc++abi instead of libstdc++ (only make sense on Linux with Clang)" ${HAVE_LIBCXX})
@ -246,7 +256,7 @@ else ()
set(NOT_UNBUNDLED 1)
endif ()
# Using system libs can cause lot of warnings in includes.
if (UNBUNDLED OR NOT (CMAKE_SYSTEM MATCHES "Linux" OR APPLE) OR ARCH_32)
if (UNBUNDLED OR NOT (ARCH_LINUX OR APPLE) OR ARCH_32)
option (NO_WERROR "Disable -Werror compiler option" ON)
endif ()
@ -268,6 +278,8 @@ include (cmake/find_zstd.cmake)
include (cmake/find_ltdl.cmake) # for odbc
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/poco/cmake/FindODBC.cmake)
include (${CMAKE_CURRENT_SOURCE_DIR}/contrib/poco/cmake/FindODBC.cmake) # for poco
else ()
include (cmake/find_odbc.cmake)
endif ()
include (cmake/find_poco.cmake)
include (cmake/find_lz4.cmake)

View File

@ -193,6 +193,12 @@ endforeach()
if(Poco_DataODBC_LIBRARY)
list(APPEND Poco_DataODBC_LIBRARY ${ODBC_LIBRARIES} ${LTDL_LIBRARY})
list(APPEND Poco_INCLUDE_DIRS ${ODBC_INCLUDE_DIRECTORIES})
endif()
if(Poco_NetSSL_LIBRARY)
list(APPEND Poco_NetSSL_LIBRARY ${OPENSSL_LIBRARIES})
list(APPEND Poco_INCLUDE_DIRS ${OPENSSL_INCLUDE_DIR})
endif()
if(DEFINED Poco_LIBRARIES)

View File

@ -1,4 +1,4 @@
if (CMAKE_SYSTEM MATCHES "FreeBSD")
if (ARCH_FREEBSD)
find_library (EXECINFO_LIBRARY execinfo)
message (STATUS "Using execinfo: ${EXECINFO_LIBRARY}")
else ()

View File

@ -1,17 +1,26 @@
option (USE_INTERNAL_GTEST_LIBRARY "Set to FALSE to use system Google Test instead of bundled" ${NOT_UNBUNDLED})
if (USE_INTERNAL_GTEST_LIBRARY AND NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest/CMakeLists.txt")
message (WARNING "submodule contrib/googletest is missing. to fix try run: \n git submodule update --init --recursive")
set (USE_INTERNAL_GTEST_LIBRARY 0)
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/googletest/googletest/CMakeLists.txt")
if (USE_INTERNAL_GTEST_LIBRARY)
message (WARNING "submodule contrib/googletest is missing. to fix try run: \n git submodule update --init --recursive")
set (USE_INTERNAL_GTEST_LIBRARY 0)
endif ()
set (MISSING_INTERNAL_GTEST_LIBRARY 1)
endif ()
if (NOT USE_INTERNAL_GTEST_LIBRARY)
find_library (GTEST_LIBRARY gtest_main)
find_path (GTEST_INCLUDE_DIR NAMES /gtest/gtest.h PATHS ${GTEST_INCLUDE_PATHS})
find_path (GTEST_ROOT NAMES src/gtest-all.cc PATHS /usr/src/googletest/googletest /usr/src/gtest)
endif ()
if (GTEST_LIBRARY AND GTEST_INCLUDE_DIR)
else ()
if (GTEST_INCLUDE_DIR AND GTEST_ROOT)
# googletest package have no lib
add_library(gtest ${GTEST_ROOT}/src/gtest-all.cc)
add_library(gtest_main ${GTEST_ROOT}/src/gtest_main.cc)
target_include_directories(gtest PRIVATE ${GTEST_ROOT})
target_link_libraries(gtest_main gtest)
set (GTEST_LIBRARY gtest_main)
elseif (NOT MISSING_INTERNAL_GTEST_LIBRARY)
set (USE_INTERNAL_GTEST_LIBRARY 1)
set (GTEST_LIBRARY gtest_main)
endif ()

View File

@ -15,7 +15,7 @@ if (USE_EMBEDDED_COMPILER)
# llvm_map_components_to_libraries - Maps LLVM used components to required libraries.
# Usage: llvm_map_components_to_libraries(REQUIRED_LLVM_LIBRARIES core jit interpreter native ...)
if (CMAKE_SYSTEM MATCHES "FreeBSD")
if (ARCH_FREEBSD)
set(LLVM_VERSION_POSTFIX "50" CACHE INTERNAL "")
else()
set(LLVM_VERSION_POSTFIX "-5.0" CACHE INTERNAL "")

66
cmake/find_odbc.cmake Normal file
View File

@ -0,0 +1,66 @@
# This file copied from contrib/poco/cmake/FindODBC.cmake to allow build without submodules
#
# Find the ODBC driver manager includes and library.
#
# ODBC is an open standard for connecting to different databases in a
# semi-vendor-independent fashion. First you install the ODBC driver
# manager. Then you need a driver for each separate database you want
# to connect to (unless a generic one works). VTK includes neither
# the driver manager nor the vendor-specific drivers: you have to find
# those yourself.
#
# This module defines
# ODBC_INCLUDE_DIRECTORIES, where to find sql.h
# ODBC_LIBRARIES, the libraries to link against to use ODBC
# ODBC_FOUND. If false, you cannot build anything that requires MySQL.
find_path(ODBC_INCLUDE_DIRECTORIES
NAMES sql.h
HINTS
/usr/include
/usr/include/odbc
/usr/include/iodbc
/usr/local/include
/usr/local/include/odbc
/usr/local/include/iodbc
/usr/local/odbc/include
/usr/local/iodbc/include
"C:/Program Files/ODBC/include"
"C:/Program Files/Microsoft SDKs/Windows/v7.0/include"
"C:/Program Files/Microsoft SDKs/Windows/v6.0a/include"
"C:/ODBC/include"
DOC "Specify the directory containing sql.h."
)
find_library(ODBC_LIBRARIES
NAMES iodbc odbc iodbcinst odbcinst odbc32
HINTS
/usr/lib
/usr/lib/odbc
/usr/lib/iodbc
/usr/local/lib
/usr/local/lib/odbc
/usr/local/lib/iodbc
/usr/local/odbc/lib
/usr/local/iodbc/lib
"C:/Program Files/ODBC/lib"
"C:/ODBC/lib/debug"
"C:/Program Files (x86)/Microsoft SDKs/Windows/v7.0A/Lib"
DOC "Specify the ODBC driver manager library here."
)
# MinGW find usually fails
if(MINGW)
set(ODBC_INCLUDE_DIRECTORIES ".")
set(ODBC_LIBRARIES odbc32)
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ODBC
DEFAULT_MSG
ODBC_INCLUDE_DIRECTORIES
ODBC_LIBRARIES
)
mark_as_advanced(ODBC_FOUND ODBC_LIBRARIES ODBC_INCLUDE_DIRECTORIES)

View File

@ -53,10 +53,6 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY)
set (Poco_NetSSL_FOUND 1)
set (Poco_NetSSL_LIBRARY PocoNetSSL)
set (Poco_Crypto_LIBRARY PocoCrypto)
set (Poco_NetSSL_INCLUDE_DIRS
"${ClickHouse_SOURCE_DIR}/contrib/poco/NetSSL_OpenSSL/include/"
"${ClickHouse_SOURCE_DIR}/contrib/poco/Crypto/include/"
)
endif ()
if (USE_STATIC_LIBRARIES AND USE_INTERNAL_ZLIB_LIBRARY)
@ -71,7 +67,6 @@ elseif (NOT MISSING_INTERNAL_POCO_LIBRARY)
set (Poco_Net_LIBRARY PocoNet)
set (Poco_Data_LIBRARY PocoData)
set (Poco_XML_LIBRARY PocoXML)
endif ()
message(STATUS "Using Poco: ${Poco_INCLUDE_DIRS} : ${Poco_Foundation_LIBRARY},${Poco_Util_LIBRARY},${Poco_Net_LIBRARY},${Poco_NetSSL_LIBRARY},${Poco_XML_LIBRARY},${Poco_Data_LIBRARY},${Poco_DataODBC_LIBRARY},${Poco_MongoDB_LIBRARY}; MongoDB=${Poco_MongoDB_FOUND}, DataODBC=${Poco_DataODBC_FOUND}, NetSSL=${Poco_NetSSL_FOUND}")

View File

@ -13,11 +13,17 @@ endif ()
if (NOT USE_INTERNAL_RDKAFKA_LIBRARY)
find_library (RDKAFKA_LIB rdkafka)
find_path (RDKAFKA_INCLUDE_DIR NAMES librdkafka/rdkafka.h PATHS ${RDKAFKA_INCLUDE_PATHS})
if (USE_STATIC_LIBRARIES AND NOT ARCH_FREEBSD)
find_library (SASL2_LIBRARY sasl2)
endif ()
endif ()
if (RDKAFKA_LIB AND RDKAFKA_INCLUDE_DIR)
set (USE_RDKAFKA 1)
set (RDKAFKA_LIBRARY ${RDKAFKA_LIB} ${OPENSSL_LIBRARIES})
if (SASL2_LIBRARY)
list (APPEND RDKAFKA_LIBRARY ${SASL2_LIBRARY})
endif ()
elseif (NOT MISSING_INTERNAL_RDKAFKA_LIBRARY)
set (USE_INTERNAL_RDKAFKA_LIBRARY 1)
set (RDKAFKA_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/librdkafka/src")

View File

@ -8,7 +8,7 @@ if (NOT ZLIB_FOUND)
set (USE_INTERNAL_ZLIB_LIBRARY 1)
set (ZLIB_COMPAT 1) # for zlib-ng, also enables WITH_GZFILEOP
set (WITH_NATIVE_INSTRUCTIONS ${ARCHNATIVE})
if (CMAKE_SYSTEM MATCHES "FreeBSD" OR ARCH_I386)
if (ARCH_FREEBSD OR ARCH_I386)
set (WITH_OPTIM 0 CACHE INTERNAL "") # Bug in assembler
endif ()
if (ARCH_AARCH64)

View File

@ -13,7 +13,9 @@ if (USE_INTERNAL_ZSTD_LIBRARY)
endif ()
if (USE_INTERNAL_RE2_LIBRARY)
add_subdirectory (libre2)
set(RE2_BUILD_TESTING 0 CACHE INTERNAL "")
add_subdirectory (re2)
add_subdirectory (re2_st)
endif ()
if (USE_INTERNAL_DOUBLE_CONVERSION_LIBRARY)
@ -107,4 +109,8 @@ if (USE_INTERNAL_POCO_LIBRARY)
set (ENABLE_TESTS ${_save})
set (CMAKE_CXX_FLAGS ${save_CMAKE_CXX_FLAGS})
set (CMAKE_C_FLAGS ${save_CMAKE_C_FLAGS})
if (OPENSSL_FOUND)
# Bug in poco https://github.com/pocoproject/poco/pull/2100 found on macos
target_include_directories(Crypto PUBLIC ${OPENSSL_INCLUDE_DIR})
endif ()
endif ()

View File

@ -1,13 +0,0 @@
# This is the official list of RE2 authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Google Inc.
Samsung Electronics
Stefano Rivera <stefano.rivera@gmail.com>

View File

@ -1,39 +0,0 @@
# This is the official list of people who can contribute
# (and typically have contributed) code to the RE2 repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Dominic Battré <battre@chromium.org>
Dmitriy Vyukov <dvyukov@google.com>
John Millikin <jmillikin@gmail.com>
Mike Nazarewicz <mpn@google.com>
Pawel Hajdan <phajdan.jr@gmail.com>
Rob Pike <r@google.com>
Russ Cox <rsc@swtch.com>
Sanjay Ghemawat <sanjay@google.com>
Stefano Rivera <stefano.rivera@gmail.com>
Srinivasan Venkatachary <vsri@google.com>
Viatcheslav Ostapenko <sl.ostapenko@samsung.com>

View File

@ -1,27 +0,0 @@
// Copyright (c) 2009 The RE2 Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1 +0,0 @@
https://github.com/google/re2/tree/7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0

View File

@ -1,113 +0,0 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_BITMAP256_H_
#define RE2_BITMAP256_H_
#ifdef _MSC_VER
#include <intrin.h>
#endif
#include <stdint.h>
#include <string.h>
#include "util/util.h"
#include "util/logging.h"
namespace re2 {
class Bitmap256 {
public:
Bitmap256() {
memset(words_, 0, sizeof words_);
}
// Tests the bit with index c.
bool Test(int c) const {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
return (words_[c / 64] & (1ULL << (c % 64))) != 0;
}
// Sets the bit with index c.
void Set(int c) {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
words_[c / 64] |= (1ULL << (c % 64));
}
// Finds the next non-zero bit with index >= c.
// Returns -1 if no such bit exists.
int FindNextSetBit(int c) const;
private:
// Finds the least significant non-zero bit in n.
static int FindLSBSet(uint64_t n) {
DCHECK_NE(n, 0);
#if defined(__GNUC__)
return __builtin_ctzll(n);
#elif defined(_MSC_VER) && defined(_M_X64)
unsigned long c;
_BitScanForward64(&c, n);
return static_cast<int>(c);
#elif defined(_MSC_VER) && defined(_M_IX86)
unsigned long c;
if (static_cast<uint32_t>(n) != 0) {
_BitScanForward(&c, static_cast<uint32_t>(n));
return static_cast<int>(c);
} else {
_BitScanForward(&c, static_cast<uint32_t>(n >> 32));
return static_cast<int>(c) + 32;
}
#else
int c = 63;
for (int shift = 1 << 5; shift != 0; shift >>= 1) {
uint64_t word = n << shift;
if (word != 0) {
n = word;
c -= shift;
}
}
return c;
#endif
}
uint64_t words_[4];
};
int Bitmap256::FindNextSetBit(int c) const {
DCHECK_GE(c, 0);
DCHECK_LE(c, 255);
// Check the word that contains the bit. Mask out any lower bits.
int i = c / 64;
uint64_t word = words_[i] & (~0ULL << (c % 64));
if (word != 0)
return (i * 64) + FindLSBSet(word);
// Check any following words.
i++;
switch (i) {
case 1:
if (words_[1] != 0)
return (1 * 64) + FindLSBSet(words_[1]);
FALLTHROUGH_INTENDED;
case 2:
if (words_[2] != 0)
return (2 * 64) + FindLSBSet(words_[2]);
FALLTHROUGH_INTENDED;
case 3:
if (words_[3] != 0)
return (3 * 64) + FindLSBSet(words_[3]);
FALLTHROUGH_INTENDED;
default:
return -1;
}
}
} // namespace re2
#endif // RE2_BITMAP256_H_

View File

@ -1,401 +0,0 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
// Prog::SearchBitState is a regular expression search with submatch
// tracking for small regular expressions and texts. Like
// testing/backtrack.cc, it allocates a bit vector with (length of
// text) * (length of prog) bits, to make sure it never explores the
// same (character position, instruction) state multiple times. This
// limits the search to run in time linear in the length of the text.
//
// Unlike testing/backtrack.cc, SearchBitState is not recursive
// on the text.
//
// SearchBitState is a fast replacement for the NFA code on small
// regexps and texts when SearchOnePass cannot be used.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "util/logging.h"
#include "re2/prog.h"
#include "re2/regexp.h"
namespace re2 {
struct Job {
int id;
int arg;
const char* p;
};
class BitState {
public:
explicit BitState(Prog* prog);
~BitState();
// The usual Search prototype.
// Can only call Search once per BitState.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
inline bool ShouldVisit(int id, const char* p);
void Push(int id, const char* p, int arg);
bool GrowStack();
bool TrySearch(int id, const char* p);
// Search parameters
Prog* prog_; // program being run
StringPiece text_; // text being searched
StringPiece context_; // greater context of text being searched
bool anchored_; // whether search is anchored at text.begin()
bool longest_; // whether search wants leftmost-longest match
bool endmatch_; // whether match must end at text.end()
StringPiece *submatch_; // submatches to fill in
int nsubmatch_; // # of submatches to fill in
// Search state
const char** cap_; // capture registers
int ncap_;
static const int VisitedBits = 32;
uint32_t *visited_; // bitmap: (Inst*, char*) pairs already backtracked
size_t nvisited_; // # of words in bitmap
Job *job_; // stack of text positions to explore
int njob_;
int maxjob_;
};
BitState::BitState(Prog* prog)
: prog_(prog),
anchored_(false),
longest_(false),
endmatch_(false),
submatch_(NULL),
nsubmatch_(0),
cap_(NULL),
ncap_(0),
visited_(NULL),
nvisited_(0),
job_(NULL),
njob_(0),
maxjob_(0) {
}
BitState::~BitState() {
delete[] visited_;
delete[] job_;
delete[] cap_;
}
// Should the search visit the pair ip, p?
// If so, remember that it was visited so that the next time,
// we don't repeat the visit.
bool BitState::ShouldVisit(int id, const char* p) {
size_t n = id * (text_.size() + 1) + (p - text_.begin());
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
return false;
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
return true;
}
// Grow the stack.
bool BitState::GrowStack() {
maxjob_ *= 2;
Job* newjob = new Job[maxjob_];
memmove(newjob, job_, njob_*sizeof job_[0]);
delete[] job_;
job_ = newjob;
if (njob_ >= maxjob_) {
LOG(DFATAL) << "Job stack overflow.";
return false;
}
return true;
}
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
void BitState::Push(int id, const char* p, int arg) {
if (njob_ >= maxjob_) {
if (!GrowStack())
return;
}
int op = prog_->inst(id)->opcode();
if (op == kInstFail)
return;
// Only check ShouldVisit when arg == 0.
// When arg > 0, we are continuing a previous visit.
if (arg == 0 && !ShouldVisit(id, p))
return;
Job* j = &job_[njob_++];
j->id = id;
j->p = p;
j->arg = arg;
}
// Try a search from instruction id0 in state p0.
// Return whether it succeeded.
bool BitState::TrySearch(int id0, const char* p0) {
bool matched = false;
bool inaltmatch = false;
const char* end = text_.end();
njob_ = 0;
Push(id0, p0, 0);
while (njob_ > 0) {
// Pop job off stack.
--njob_;
int id = job_[njob_].id;
const char* p = job_[njob_].p;
int arg = job_[njob_].arg;
// Optimization: rather than push and pop,
// code that is going to Push and continue
// the loop simply updates ip, p, and arg
// and jumps to CheckAndLoop. We have to
// do the ShouldVisit check that Push
// would have, but we avoid the stack
// manipulation.
if (0) {
Next:
// If the Match of a non-greedy AltMatch failed,
// we stop ourselves from trying the ByteRange,
// which would steer us off the short circuit.
if (prog_->inst(id)->last() || inaltmatch)
continue;
id++;
CheckAndLoop:
if (!ShouldVisit(id, p))
continue;
}
// Visit ip, p.
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
return false;
case kInstFail:
continue;
case kInstAltMatch:
switch (arg) {
case 0:
inaltmatch = true;
Push(id, p, 1); // come back when we're done
// One opcode is ByteRange; the other leads to Match
// (possibly via Nop or Capture).
if (ip->greedy(prog_)) {
// out1 is the match
Push(ip->out1(), p, 0);
id = ip->out1();
p = end;
goto CheckAndLoop;
}
// out is the match - non-greedy
Push(ip->out(), end, 0);
id = ip->out();
goto CheckAndLoop;
case 1:
inaltmatch = false;
continue;
}
LOG(DFATAL) << "Bad arg in kInstAltMatch: " << arg;
continue;
case kInstByteRange: {
int c = -1;
if (p < end)
c = *p & 0xFF;
if (!ip->Matches(c))
goto Next;
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
id = ip->out();
p++;
goto CheckAndLoop;
}
case kInstCapture:
switch (arg) {
case 0:
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
if (0 <= ip->cap() && ip->cap() < ncap_) {
// Capture p to register, but save old value.
Push(id, cap_[ip->cap()], 1); // come back when we're done
cap_[ip->cap()] = p;
}
// Continue on.
id = ip->out();
goto CheckAndLoop;
case 1:
// Finished ip->out(); restore the old value.
cap_[ip->cap()] = p;
continue;
}
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
continue;
case kInstEmptyWidth:
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
goto Next;
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstNop:
if (!ip->last())
Push(id+1, p, 0); // try the next when we're done
id = ip->out();
goto CheckAndLoop;
case kInstMatch: {
if (endmatch_ && p != text_.end())
goto Next;
// We found a match. If the caller doesn't care
// where the match is, no point going further.
if (nsubmatch_ == 0)
return true;
// Record best match so far.
// Only need to check end point, because this entire
// call is only considering one start position.
matched = true;
cap_[1] = p;
if (submatch_[0].data() == NULL ||
(longest_ && p > submatch_[0].end())) {
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] =
StringPiece(cap_[2 * i],
static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
}
// If going for first match, we're done.
if (!longest_)
return true;
// If we used the entire text, no longer match is possible.
if (p == text_.end())
return true;
// Otherwise, continue on in hope of a longer match.
goto Next;
}
}
}
return matched;
}
// Search text (within context) for prog_.
bool BitState::Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
// Search parameters.
text_ = text;
context_ = context;
if (context_.begin() == NULL)
context_ = text;
if (prog_->anchor_start() && context_.begin() != text.begin())
return false;
if (prog_->anchor_end() && context_.end() != text.end())
return false;
anchored_ = anchored || prog_->anchor_start();
longest_ = longest || prog_->anchor_end();
endmatch_ = prog_->anchor_end();
submatch_ = submatch;
nsubmatch_ = nsubmatch;
for (int i = 0; i < nsubmatch_; i++)
submatch_[i] = StringPiece();
// Allocate scratch space.
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
visited_ = new uint32_t[nvisited_];
memset(visited_, 0, nvisited_*sizeof visited_[0]);
ncap_ = 2*nsubmatch;
if (ncap_ < 2)
ncap_ = 2;
cap_ = new const char*[ncap_];
memset(cap_, 0, ncap_*sizeof cap_[0]);
maxjob_ = 256;
job_ = new Job[maxjob_];
// Anchored search must start at text.begin().
if (anchored_) {
cap_[0] = text.begin();
return TrySearch(prog_->start(), text.begin());
}
// Unanchored search, starting from each possible text position.
// Notice that we have to try the empty string at the end of
// the text, so the loop condition is p <= text.end(), not p < text.end().
// This looks like it's quadratic in the size of the text,
// but we are not clearing visited_ between calls to TrySearch,
// so no work is duplicated and it ends up still being linear.
for (const char* p = text.begin(); p <= text.end(); p++) {
// Try to use memchr to find the first byte quickly.
int fb = prog_->first_byte();
if (fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL)
p = text.end();
}
cap_[0] = p;
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
return true;
}
return false;
}
// Bit-state search.
bool Prog::SearchBitState(const StringPiece& text,
const StringPiece& context,
Anchor anchor,
MatchKind kind,
StringPiece* match,
int nmatch) {
// If full match, we ask for an anchored longest match
// and then check that match[0] == text.
// So make sure match[0] exists.
StringPiece sp0;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch < 1) {
match = &sp0;
nmatch = 1;
}
}
// Run the search.
BitState b(this);
bool anchored = anchor == kAnchored;
bool longest = kind != kFirstMatch;
if (!b.Search(text, context, anchored, longest, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
} // namespace re2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,121 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/filtered_re2.h"
#include <stddef.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "re2/prefilter.h"
#include "re2/prefilter_tree.h"
namespace re2 {
FilteredRE2::FilteredRE2()
: compiled_(false),
prefilter_tree_(new PrefilterTree()) {
}
FilteredRE2::FilteredRE2(int min_atom_len)
: compiled_(false),
prefilter_tree_(new PrefilterTree(min_atom_len)) {
}
FilteredRE2::~FilteredRE2() {
for (size_t i = 0; i < re2_vec_.size(); i++)
delete re2_vec_[i];
delete prefilter_tree_;
}
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
const RE2::Options& options, int* id) {
RE2* re = new RE2(pattern, options);
RE2::ErrorCode code = re->error_code();
if (!re->ok()) {
if (options.log_errors()) {
LOG(ERROR) << "Couldn't compile regular expression, skipping: "
<< re << " due to error " << re->error();
}
delete re;
} else {
*id = static_cast<int>(re2_vec_.size());
re2_vec_.push_back(re);
}
return code;
}
void FilteredRE2::Compile(std::vector<string>* atoms) {
if (compiled_) {
LOG(ERROR) << "Compile called already.";
return;
}
if (re2_vec_.empty()) {
LOG(ERROR) << "Compile called before Add.";
return;
}
for (size_t i = 0; i < re2_vec_.size(); i++) {
Prefilter* prefilter = Prefilter::FromRE2(re2_vec_[i]);
prefilter_tree_->Add(prefilter);
}
atoms->clear();
prefilter_tree_->Compile(atoms);
compiled_ = true;
}
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
for (size_t i = 0; i < re2_vec_.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[i]))
return static_cast<int>(i);
return -1;
}
int FilteredRE2::FirstMatch(const StringPiece& text,
const std::vector<int>& atoms) const {
if (!compiled_) {
LOG(DFATAL) << "FirstMatch called before Compile.";
return -1;
}
std::vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
return regexps[i];
return -1;
}
bool FilteredRE2::AllMatches(
const StringPiece& text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const {
matching_regexps->clear();
std::vector<int> regexps;
prefilter_tree_->RegexpsGivenStrings(atoms, &regexps);
for (size_t i = 0; i < regexps.size(); i++)
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
matching_regexps->push_back(regexps[i]);
return !matching_regexps->empty();
}
void FilteredRE2::AllPotentials(
const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const {
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
}
void FilteredRE2::RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps) {
prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
void FilteredRE2::PrintPrefilter(int regexpid) {
prefilter_tree_->PrintPrefilter(regexpid);
}
} // namespace re2

View File

@ -1,110 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_FILTERED_RE2_H_
#define RE2_FILTERED_RE2_H_
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
// It provides a prefilter mechanism that helps in cutting down the
// number of regexps that need to be actually searched.
//
// By design, it does not include a string matching engine. This is to
// allow the user of the class to use their favorite string match
// engine. The overall flow is: Add all the regexps using Add, then
// Compile the FilteredRE2. The compile returns strings that need to
// be matched. Note that all returned strings are lowercase. For
// applying regexps to a search text, the caller does the string
// matching using the strings returned. When doing the string match,
// note that the caller has to do that on lower cased version of the
// search text. Then call FirstMatch or AllMatches with a vector of
// indices of strings that were found in the text to get the actual
// regexp matches.
#include <string>
#include <vector>
#include "re2/re2.h"
namespace re2 {
class PrefilterTree;
class FilteredRE2 {
public:
FilteredRE2();
explicit FilteredRE2(int min_atom_len);
~FilteredRE2();
// Uses RE2 constructor to create a RE2 object (re). Returns
// re->error_code(). If error_code is other than NoError, then re is
// deleted and not added to re2_vec_.
RE2::ErrorCode Add(const StringPiece& pattern,
const RE2::Options& options,
int *id);
// Prepares the regexps added by Add for filtering. Returns a set
// of strings that the caller should check for in candidate texts.
// The returned strings are lowercased. When doing string matching,
// the search text should be lowercased first to find matching
// strings from the set of strings returned by Compile. Call after
// all Add calls are done.
void Compile(std::vector<string>* strings_to_match);
// Returns the index of the first matching regexp.
// Returns -1 on no match. Can be called prior to Compile.
// Does not do any filtering: simply tries to Match the
// regexps in a loop.
int SlowFirstMatch(const StringPiece& text) const;
// Returns the index of the first matching regexp.
// Returns -1 on no match. Compile has to be called before
// calling this.
int FirstMatch(const StringPiece& text,
const std::vector<int>& atoms) const;
// Returns the indices of all matching regexps, after first clearing
// matched_regexps.
bool AllMatches(const StringPiece& text,
const std::vector<int>& atoms,
std::vector<int>* matching_regexps) const;
// Returns the indices of all potentially matching regexps after first
// clearing potential_regexps.
// A regexp is potentially matching if it passes the filter.
// If a regexp passes the filter it may still not match.
// A regexp that does not pass the filter is guaranteed to not match.
void AllPotentials(const std::vector<int>& atoms,
std::vector<int>* potential_regexps) const;
// The number of regexps added.
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
private:
// Get the individual RE2 objects. Useful for testing.
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
// Print prefilter.
void PrintPrefilter(int regexpid);
// Useful for testing and debugging.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* passed_regexps);
// All the regexps in the FilteredRE2.
std::vector<RE2*> re2_vec_;
// Has the FilteredRE2 been compiled using Compile()
bool compiled_;
// An AND-OR tree of string atoms used for filtering regexps.
PrefilterTree* prefilter_tree_;
FilteredRE2(const FilteredRE2&) = delete;
FilteredRE2& operator=(const FilteredRE2&) = delete;
};
} // namespace re2
#endif // RE2_FILTERED_RE2_H_

View File

@ -1,187 +0,0 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Determine whether this library should match PCRE exactly
// for a particular Regexp. (If so, the testing framework can
// check that it does.)
//
// This library matches PCRE except in these cases:
// * the regexp contains a repetition of an empty string,
// like (a*)* or (a*)+. In this case, PCRE will treat
// the repetition sequence as ending with an empty string,
// while this library does not.
// * Perl and PCRE differ on whether \v matches \n.
// For historical reasons, this library implements the Perl behavior.
// * Perl and PCRE allow $ in one-line mode to match either the very
// end of the text or just before a \n at the end of the text.
// This library requires it to match only the end of the text.
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
// match the end of the text if the last character is a \n.
// This library does allow it.
//
// Regexp::MimicsPCRE checks for any of these conditions.
#include "util/util.h"
#include "util/logging.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Returns whether re might match an empty string.
static bool CanBeEmptyString(Regexp *re);
// Walker class to compute whether library handles a regexp
// exactly as PCRE would. See comment at top for conditions.
class PCREWalker : public Regexp::Walker<bool> {
public:
PCREWalker() {}
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
};
// Called after visiting each of re's children and accumulating
// the return values in child_args. So child_args contains whether
// this library mimics PCRE for those subexpressions.
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
// If children failed, so do we.
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
// Otherwise look for other reasons to fail.
switch (re->op()) {
// Look for repeated empty string.
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
if (CanBeEmptyString(re->sub()[0]))
return false;
break;
case kRegexpRepeat:
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
return false;
break;
// Look for \v
case kRegexpLiteral:
if (re->rune() == '\v')
return false;
break;
// Look for $ in single-line mode.
case kRegexpEndText:
case kRegexpEmptyMatch:
if (re->parse_flags() & Regexp::WasDollar)
return false;
break;
// Look for ^ in multi-line mode.
case kRegexpBeginLine:
// No condition: in single-line mode ^ becomes kRegexpBeginText.
return false;
default:
break;
}
// Not proven guilty.
return true;
}
// Returns whether this regexp's behavior will mimic PCRE's exactly.
bool Regexp::MimicsPCRE() {
PCREWalker w;
return w.Walk(this, true);
}
// Walker class to compute whether a Regexp can match an empty string.
// It is okay to overestimate. For example, \b\B cannot match an empty
// string, because \b and \B are mutually exclusive, but this isn't
// that smart and will say it can. Spurious empty strings
// will reduce the number of regexps we sanity check against PCRE,
// but they won't break anything.
class EmptyStringWalker : public Regexp::Walker<bool> {
public:
EmptyStringWalker() { }
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args);
bool ShortVisit(Regexp* re, bool a) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
return a;
}
private:
EmptyStringWalker(const EmptyStringWalker&) = delete;
EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
};
// Called after visiting re's children. child_args contains the return
// value from each of the children's PostVisits (i.e., whether each child
// can match an empty string). Returns whether this clause can match an
// empty string.
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
bool* child_args, int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch: // never empty
case kRegexpLiteral:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpCharClass:
case kRegexpLiteralString:
return false;
case kRegexpEmptyMatch: // always empty
case kRegexpBeginLine: // always empty, when they match
case kRegexpEndLine:
case kRegexpNoWordBoundary:
case kRegexpWordBoundary:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpStar: // can always be empty
case kRegexpQuest:
case kRegexpHaveMatch:
return true;
case kRegexpConcat: // can be empty if all children can
for (int i = 0; i < nchild_args; i++)
if (!child_args[i])
return false;
return true;
case kRegexpAlternate: // can be empty if any child can
for (int i = 0; i < nchild_args; i++)
if (child_args[i])
return true;
return false;
case kRegexpPlus: // can be empty if the child can
case kRegexpCapture:
return child_args[0];
case kRegexpRepeat: // can be empty if child can or is x{0}
return child_args[0] || re->min() == 0;
}
return false;
}
// Returns whether re can match an empty string.
static bool CanBeEmptyString(Regexp* re) {
EmptyStringWalker w;
return w.Walk(re, true);
}
} // namespace re2

View File

@ -1,791 +0,0 @@
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchNFA, an NFA search.
// This is an actual NFA like the theorists talk about,
// not the pseudo-NFA found in backtracking regexp implementations.
//
// IMPLEMENTATION
//
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
// which is a variant of the one described in Thompson's 1968 CACM paper.
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
// over the DFA implementation is that it tracks submatch boundaries.
//
// When the choice of submatch boundaries is ambiguous, this particular
// implementation makes the same choices that traditional backtracking
// implementations (in particular, Perl and PCRE) do.
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
// time in the length of the input.
//
// Like Thompson's original machine and like the DFA implementation, this
// implementation notices a match only once it is one byte past it.
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "re2/prog.h"
#include "re2/regexp.h"
#include "util/logging.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
namespace re2 {
static const bool ExtraDebug = false;
class NFA {
public:
NFA(Prog* prog);
~NFA();
// Searches for a matching string.
// * If anchored is true, only considers matches starting at offset.
// Otherwise finds lefmost match at or after offset.
// * If longest is true, returns the longest match starting
// at the chosen start point. Otherwise returns the so-called
// left-biased match, the one traditional backtracking engines
// (like Perl and PCRE) find.
// Records submatch boundaries in submatch[1..nsubmatch-1].
// Submatch[0] is the entire match. When there is a choice in
// which text matches each subexpression, the submatch boundaries
// are chosen to match what a backtracking implementation would choose.
bool Search(const StringPiece& text, const StringPiece& context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch);
private:
struct Thread {
union {
int ref;
Thread* next; // when on free list
};
const char** capture;
};
// State for explicit stack in AddToThreadq.
struct AddState {
int id; // Inst to process
Thread* t; // if not null, set t0 = t before processing id
AddState()
: id(0), t(NULL) {}
explicit AddState(int id)
: id(id), t(NULL) {}
AddState(int id, Thread* t)
: id(id), t(t) {}
};
// Threadq is a list of threads. The list is sorted by the order
// in which Perl would explore that particular state -- the earlier
// choices appear earlier in the list.
typedef SparseArray<Thread*> Threadq;
inline Thread* AllocThread();
inline Thread* Incref(Thread* t);
inline void Decref(Thread* t);
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// p is the current input position, and t0 is the current thread.
void AddToThreadq(Threadq* q, int id0, int c, int flag,
const char* p, Thread* t0);
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
// Returns text version of capture information, for debugging.
string FormatCapture(const char** capture);
inline void CopyCapture(const char** dst, const char** src);
Prog* prog_; // underlying program
int start_; // start instruction in program
int ncapture_; // number of submatches to track
bool longest_; // whether searching for longest match
bool endmatch_; // whether match must end at text.end()
const char* btext_; // beginning of text being matched (for FormatSubmatch)
const char* etext_; // end of text being matched (for endmatch_)
Threadq q0_, q1_; // pre-allocated for Search.
const char** match_; // best match so far
bool matched_; // any match so far?
AddState* astack_; // pre-allocated for AddToThreadq
int nastack_;
Thread* free_threads_; // free list
NFA(const NFA&) = delete;
NFA& operator=(const NFA&) = delete;
};
NFA::NFA(Prog* prog) {
prog_ = prog;
start_ = prog_->start();
ncapture_ = 0;
longest_ = false;
endmatch_ = false;
btext_ = NULL;
etext_ = NULL;
q0_.resize(prog_->size());
q1_.resize(prog_->size());
// See NFA::AddToThreadq() for why this is so.
nastack_ = 2*prog_->inst_count(kInstCapture) +
prog_->inst_count(kInstEmptyWidth) +
prog_->inst_count(kInstNop) + 1; // + 1 for start inst
astack_ = new AddState[nastack_];
match_ = NULL;
matched_ = false;
free_threads_ = NULL;
}
NFA::~NFA() {
delete[] match_;
delete[] astack_;
Thread* next;
for (Thread* t = free_threads_; t; t = next) {
next = t->next;
delete[] t->capture;
delete t;
}
}
NFA::Thread* NFA::AllocThread() {
Thread* t = free_threads_;
if (t == NULL) {
t = new Thread;
t->ref = 1;
t->capture = new const char*[ncapture_];
return t;
}
free_threads_ = t->next;
t->ref = 1;
return t;
}
NFA::Thread* NFA::Incref(Thread* t) {
DCHECK(t != NULL);
t->ref++;
return t;
}
void NFA::Decref(Thread* t) {
if (t == NULL)
return;
t->ref--;
if (t->ref > 0)
return;
DCHECK_EQ(t->ref, 0);
t->next = free_threads_;
free_threads_ = t;
}
void NFA::CopyCapture(const char** dst, const char** src) {
for (int i = 0; i < ncapture_; i+=2) {
dst[i] = src[i];
dst[i+1] = src[i+1];
}
}
// Follows all empty arrows from id0 and enqueues all the states reached.
// Enqueues only the ByteRange instructions that match byte c.
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
// p is the current input position, and t0 is the current thread.
void NFA::AddToThreadq(Threadq* q, int id0, int c, int flag,
const char* p, Thread* t0) {
if (id0 == 0)
return;
// Use astack_ to hold our stack of instructions yet to process.
// It was preallocated as follows:
// two entries per Capture;
// one entry per EmptyWidth; and
// one entry per Nop.
// This reflects the maximum number of stack pushes that each can
// perform. (Each instruction can be processed at most once.)
AddState* stk = astack_;
int nstk = 0;
stk[nstk++] = AddState(id0);
while (nstk > 0) {
DCHECK_LE(nstk, nastack_);
AddState a = stk[--nstk];
Loop:
if (a.t != NULL) {
// t0 was a thread that we allocated and copied in order to
// record the capture, so we must now decref it.
Decref(t0);
t0 = a.t;
}
int id = a.id;
if (id == 0)
continue;
if (q->has_index(id)) {
if (ExtraDebug)
fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str());
continue;
}
// Create entry in q no matter what. We might fill it in below,
// or we might not. Even if not, it is necessary to have it,
// so that we don't revisit id0 during the recursion.
q->set_new(id, NULL);
Thread** tp = &q->find(id)->second;
int j;
Thread* t;
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
break;
case kInstFail:
break;
case kInstAltMatch:
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
DCHECK(!ip->last());
a = AddState(id+1);
goto Loop;
case kInstNop:
if (!ip->last())
stk[nstk++] = AddState(id+1);
// Continue on.
a = AddState(ip->out());
goto Loop;
case kInstCapture:
if (!ip->last())
stk[nstk++] = AddState(id+1);
if ((j=ip->cap()) < ncapture_) {
// Push a dummy whose only job is to restore t0
// once we finish exploring this possibility.
stk[nstk++] = AddState(0, t0);
// Record capture.
t = AllocThread();
CopyCapture(t->capture, t0->capture);
t->capture[j] = p;
t0 = t;
}
a = AddState(ip->out());
goto Loop;
case kInstByteRange:
if (!ip->Matches(c))
goto Next;
FALLTHROUGH_INTENDED;
case kInstMatch:
// Save state; will pick up at next byte.
t = Incref(t0);
*tp = t;
if (ExtraDebug)
fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str());
Next:
if (ip->last())
break;
a = AddState(id+1);
goto Loop;
case kInstEmptyWidth:
if (!ip->last())
stk[nstk++] = AddState(id+1);
// Continue on if we have all the right flag bits.
if (ip->empty() & ~flag)
break;
a = AddState(ip->out());
goto Loop;
}
}
}
// Run runq on byte c, appending new states to nextq.
// Updates matched_ and match_ as new, better matches are found.
// p is the position of byte c in the input string for AddToThreadq;
// p-1 will be used when processing Match instructions.
// flag is the bitwise OR of Bol, Eol, etc., specifying whether
// ^, $ and \b match the current input position (after c).
// Frees all the threads on runq.
// If there is a shortcut to the end, returns that shortcut.
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
nextq->clear();
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
if (longest_) {
// Can skip any threads started after our current best match.
if (matched_ && match_[0] < t->capture[0]) {
Decref(t);
continue;
}
}
int id = i->index();
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
// Should only see the values handled below.
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
break;
case kInstByteRange:
AddToThreadq(nextq, ip->out(), c, flag, p, t);
break;
case kInstAltMatch:
if (i != runq->begin())
break;
// The match is ours if we want it.
if (ip->greedy(prog_) || longest_) {
CopyCapture(match_, t->capture);
matched_ = true;
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->second);
runq->clear();
if (ip->greedy(prog_))
return ip->out1();
return ip->out();
}
break;
case kInstMatch: {
// Avoid invoking undefined behavior when p happens
// to be null - and p-1 would be meaningless anyway.
if (p == NULL)
break;
if (endmatch_ && p-1 != etext_)
break;
if (longest_) {
// Leftmost-longest mode: save this match only if
// it is either farther to the left or at the same
// point but longer than an existing match.
if (!matched_ || t->capture[0] < match_[0] ||
(t->capture[0] == match_[0] && p-1 > match_[1])) {
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
}
} else {
// Leftmost-biased mode: this match is by definition
// better than what we've already found (see next line).
CopyCapture(match_, t->capture);
match_[1] = p-1;
matched_ = true;
// Cut off the threads that can only find matches
// worse than the one we just found: don't run the
// rest of the current Threadq.
Decref(t);
for (++i; i != runq->end(); ++i)
Decref(i->second);
runq->clear();
return 0;
}
break;
}
}
Decref(t);
}
runq->clear();
return 0;
}
string NFA::FormatCapture(const char** capture) {
string s;
for (int i = 0; i < ncapture_; i+=2) {
if (capture[i] == NULL)
StringAppendF(&s, "(?,?)");
else if (capture[i+1] == NULL)
StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
else
StringAppendF(&s, "(%d,%d)",
(int)(capture[i] - btext_),
(int)(capture[i+1] - btext_));
}
return s;
}
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
bool anchored, bool longest,
StringPiece* submatch, int nsubmatch) {
if (start_ == 0)
return false;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
// Sanity check: make sure that text lies within context.
if (text.begin() < context.begin() || text.end() > context.end()) {
LOG(DFATAL) << "context does not contain text";
return false;
}
if (prog_->anchor_start() && context.begin() != text.begin())
return false;
if (prog_->anchor_end() && context.end() != text.end())
return false;
anchored |= prog_->anchor_start();
if (prog_->anchor_end()) {
longest = true;
endmatch_ = true;
etext_ = text.end();
}
if (nsubmatch < 0) {
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
return false;
}
// Save search parameters.
ncapture_ = 2*nsubmatch;
longest_ = longest;
if (nsubmatch == 0) {
// We need to maintain match[0], both to distinguish the
// longest match (if longest is true) and also to tell
// whether we've seen any matches at all.
ncapture_ = 2;
}
match_ = new const char*[ncapture_];
matched_ = false;
// For debugging prints.
btext_ = context.begin();
if (ExtraDebug)
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
text.ToString().c_str(), context.ToString().c_str(), anchored,
longest);
// Set up search.
Threadq* runq = &q0_;
Threadq* nextq = &q1_;
runq->clear();
nextq->clear();
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
int wasword = 0;
if (text.begin() > context.begin())
wasword = Prog::IsWordChar(text.begin()[-1] & 0xFF);
// Loop over the text, stepping the machine.
for (const char* p = text.begin();; p++) {
// Check for empty-width specials.
int flag = 0;
// ^ and \A
if (p == context.begin())
flag |= kEmptyBeginText | kEmptyBeginLine;
else if (p <= context.end() && p[-1] == '\n')
flag |= kEmptyBeginLine;
// $ and \z
if (p == context.end())
flag |= kEmptyEndText | kEmptyEndLine;
else if (p < context.end() && p[0] == '\n')
flag |= kEmptyEndLine;
// \b and \B
int isword = 0;
if (p < context.end())
isword = Prog::IsWordChar(p[0] & 0xFF);
if (isword != wasword)
flag |= kEmptyWordBoundary;
else
flag |= kEmptyNonWordBoundary;
if (ExtraDebug) {
int c = 0;
if (p == context.begin())
c = '^';
else if (p > text.end())
c = '$';
else if (p < text.end())
c = p[0] & 0xFF;
fprintf(stderr, "%c[%#x/%d/%d]:", c, flag, isword, wasword);
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
Thread* t = i->second;
if (t == NULL)
continue;
fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str());
}
fprintf(stderr, "\n");
}
// This is a no-op the first time around the loop because runq is empty.
int id = Step(runq, nextq, p < text.end() ? p[0] & 0xFF : -1, flag, p);
DCHECK_EQ(runq->size(), 0);
using std::swap;
swap(nextq, runq);
nextq->clear();
if (id != 0) {
// We're done: full match ahead.
p = text.end();
for (;;) {
Prog::Inst* ip = prog_->inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
break;
case kInstCapture:
if (ip->cap() < ncapture_)
match_[ip->cap()] = p;
id = ip->out();
continue;
case kInstNop:
id = ip->out();
continue;
case kInstMatch:
match_[1] = p;
matched_ = true;
break;
}
break;
}
break;
}
if (p > text.end())
break;
// Start a new thread if there have not been any matches.
// (No point in starting a new thread if there have been
// matches, since it would be to the right of the match
// we already found.)
if (!matched_ && (!anchored || p == text.begin())) {
// If there's a required first byte for an unanchored search
// and we're not in the middle of any possible matches,
// use memchr to search for the byte quickly.
int fb = prog_->first_byte();
if (!anchored && runq->size() == 0 &&
fb >= 0 && p < text.end() && (p[0] & 0xFF) != fb) {
p = reinterpret_cast<const char*>(memchr(p, fb, text.end() - p));
if (p == NULL) {
p = text.end();
isword = 0;
} else {
isword = Prog::IsWordChar(p[0] & 0xFF);
}
flag = Prog::EmptyFlags(context, p);
}
Thread* t = AllocThread();
CopyCapture(t->capture, match_);
t->capture[0] = p;
AddToThreadq(runq, start_, p < text.end() ? p[0] & 0xFF : -1, flag, p, t);
Decref(t);
}
// If all the threads have died, stop early.
if (runq->size() == 0) {
if (ExtraDebug)
fprintf(stderr, "dead\n");
break;
}
wasword = isword;
}
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
Decref(i->second);
if (matched_) {
for (int i = 0; i < nsubmatch; i++)
submatch[i] =
StringPiece(match_[2 * i],
static_cast<size_t>(match_[2 * i + 1] - match_[2 * i]));
if (ExtraDebug)
fprintf(stderr, "match (%td,%td)\n",
match_[0] - btext_, match_[1] - btext_);
return true;
}
return false;
}
// Computes whether all successful matches have a common first byte,
// and if so, returns that byte. If not, returns -1.
int Prog::ComputeFirstByte() {
int b = -1;
SparseSet q(size());
q.insert(start());
for (SparseSet::iterator it = q.begin(); it != q.end(); ++it) {
int id = *it;
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
break;
case kInstMatch:
// The empty string matches: no first byte.
return -1;
case kInstByteRange:
if (!ip->last())
q.insert(id+1);
// Must match only a single byte
if (ip->lo() != ip->hi())
return -1;
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
return -1;
// If we haven't seen any bytes yet, record it;
// otherwise must match the one we saw before.
if (b == -1)
b = ip->lo();
else if (b != ip->lo())
return -1;
break;
case kInstNop:
case kInstCapture:
case kInstEmptyWidth:
if (!ip->last())
q.insert(id+1);
// Continue on.
// Ignore ip->empty() flags for kInstEmptyWidth
// in order to be as conservative as possible
// (assume all possible empty-width flags are true).
if (ip->out())
q.insert(ip->out());
break;
case kInstAltMatch:
DCHECK(!ip->last());
q.insert(id+1);
break;
case kInstFail:
break;
}
}
return b;
}
bool
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (ExtraDebug)
Dump();
NFA nfa(this);
StringPiece sp;
if (kind == kFullMatch) {
anchor = kAnchored;
if (nmatch == 0) {
match = &sp;
nmatch = 1;
}
}
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
return false;
if (kind == kFullMatch && match[0].end() != text.end())
return false;
return true;
}
// For each instruction i in the program reachable from the start, compute the
// number of instructions reachable from i by following only empty transitions
// and record that count as fanout[i].
//
// fanout holds the results and is also the work queue for the outer iteration.
// reachable holds the reached nodes for the inner iteration.
void Prog::Fanout(SparseArray<int>* fanout) {
DCHECK_EQ(fanout->max_size(), size());
SparseSet reachable(size());
fanout->clear();
fanout->set_new(start(), 0);
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
int* count = &i->second;
reachable.clear();
reachable.insert(i->index());
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
int id = *j;
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
break;
case kInstByteRange:
if (!ip->last())
reachable.insert(id+1);
(*count)++;
if (!fanout->has_index(ip->out())) {
fanout->set_new(ip->out(), 0);
}
break;
case kInstAltMatch:
DCHECK(!ip->last());
reachable.insert(id+1);
break;
case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
if (!ip->last())
reachable.insert(id+1);
reachable.insert(ip->out());
break;
case kInstMatch:
if (!ip->last())
reachable.insert(id+1);
break;
case kInstFail:
break;
}
}
}
}
} // namespace re2

View File

@ -1,627 +0,0 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tested by search_test.cc.
//
// Prog::SearchOnePass is an efficient implementation of
// regular expression search with submatch tracking for
// what I call "one-pass regular expressions". (An alternate
// name might be "backtracking-free regular expressions".)
//
// One-pass regular expressions have the property that
// at each input byte during an anchored match, there may be
// multiple alternatives but only one can proceed for any
// given input byte.
//
// For example, the regexp /x*yx*/ is one-pass: you read
// x's until a y, then you read the y, then you keep reading x's.
// At no point do you have to guess what to do or back up
// and try a different guess.
//
// On the other hand, /x*x/ is not one-pass: when you're
// looking at an input "x", it's not clear whether you should
// use it to extend the x* or as the final x.
//
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
//
// A simple intuition for identifying one-pass regular expressions
// is that it's always immediately obvious when a repetition ends.
// It must also be immediately obvious which branch of an | to take:
//
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
//
// The NFA-based search in nfa.cc does some bookkeeping to
// avoid the need for backtracking and its associated exponential blowup.
// But if we have a one-pass regular expression, there is no
// possibility of backtracking, so there is no need for the
// extra bookkeeping. Hence, this code.
//
// On a one-pass regular expression, the NFA code in nfa.cc
// runs at about 1/20 of the backtracking-based PCRE speed.
// In contrast, the code in this file runs at about the same
// speed as PCRE.
//
// One-pass regular expressions get used a lot when RE is
// used for parsing simple strings, so it pays off to
// notice them and handle them efficiently.
//
// See also Anne Brüggemann-Klein and Derick Wood,
// "One-unambiguous regular languages", Information and Computation 142(2).
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/sparse_set.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/prog.h"
#include "re2/stringpiece.h"
// Silence "zero-sized array in struct/union" warning for OneState::action.
#ifdef _MSC_VER
#pragma warning(disable: 4200)
#endif
namespace re2 {
static const bool ExtraDebug = false;
// The key insight behind this implementation is that the
// non-determinism in an NFA for a one-pass regular expression
// is contained. To explain what that means, first a
// refresher about what regular expression programs look like
// and how the usual NFA execution runs.
//
// In a regular expression program, only the kInstByteRange
// instruction processes an input byte c and moves on to the
// next byte in the string (it does so if c is in the given range).
// The kInstByteRange instructions correspond to literal characters
// and character classes in the regular expression.
//
// The kInstAlt instructions are used as wiring to connect the
// kInstByteRange instructions together in interesting ways when
// implementing | + and *.
// The kInstAlt instruction forks execution, like a goto that
// jumps to ip->out() and ip->out1() in parallel. Each of the
// resulting computation paths is called a thread.
//
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
// are interesting in their own right but like kInstAlt they don't
// advance the input pointer. Only kInstByteRange does.
//
// The automaton execution in nfa.cc runs all the possible
// threads of execution in lock-step over the input. To process
// a particular byte, each thread gets run until it either dies
// or finds a kInstByteRange instruction matching the byte.
// If the latter happens, the thread stops just past the
// kInstByteRange instruction (at ip->out()) and waits for
// the other threads to finish processing the input byte.
// Then, once all the threads have processed that input byte,
// the whole process repeats. The kInstAlt state instruction
// might create new threads during input processing, but no
// matter what, all the threads stop after a kInstByteRange
// and wait for the other threads to "catch up".
// Running in lock step like this ensures that the NFA reads
// the input string only once.
//
// Each thread maintains its own set of capture registers
// (the string positions at which it executed the kInstCapture
// instructions corresponding to capturing parentheses in the
// regular expression). Repeated copying of the capture registers
// is the main performance bottleneck in the NFA implementation.
//
// A regular expression program is "one-pass" if, no matter what
// the input string, there is only one thread that makes it
// past a kInstByteRange instruction at each input byte. This means
// that there is in some sense only one active thread throughout
// the execution. Other threads might be created during the
// processing of an input byte, but they are ephemeral: only one
// thread is left to start processing the next input byte.
// This is what I meant above when I said the non-determinism
// was "contained".
//
// To execute a one-pass regular expression program, we can build
// a DFA (no non-determinism) that has at most as many states as
// the NFA (compare this to the possibly exponential number of states
// in the general case). Each state records, for each possible
// input byte, the next state along with the conditions required
// before entering that state -- empty-width flags that must be true
// and capture operations that must be performed. It also records
// whether a set of conditions required to finish a match at that
// point in the input rather than process the next byte.
// A state in the one-pass NFA - just an array of actions indexed
// by the bytemap_[] of the next input byte. (The bytemap
// maps next input bytes into equivalence classes, to reduce
// the memory footprint.)
struct OneState {
uint32_t matchcond; // conditions to match right now.
uint32_t action[];
};
// The uint32_t conditions in the action are a combination of
// condition and capture bits and the next state. The bottom 16 bits
// are the condition and capture bits, and the top 16 are the index of
// the next state.
//
// Bits 0-5 are the empty-width flags from prog.h.
// Bit 6 is kMatchWins, which means the match takes
// priority over moving to next in a first-match search.
// The remaining bits mark capture registers that should
// be set to the current input position. The capture bits
// start at index 2, since the search loop can take care of
// cap[0], cap[1] (the overall match position).
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
// No input position can satisfy both kEmptyWordBoundary
// and kEmptyNonWordBoundary, so we can use that as a sentinel
// instead of needing an extra bit.
static const int kIndexShift = 16; // number of bits below index
static const int kEmptyShift = 6; // number of empty flags in prog.h
static const int kRealCapShift = kEmptyShift + 1;
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
// Parameters used to skip over cap[0], cap[1].
static const int kCapShift = kRealCapShift - 2;
static const int kMaxCap = kRealMaxCap + 2;
static const uint32_t kMatchWins = 1 << kEmptyShift;
static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
// Check, at compile time, that prog.h agrees with math above.
// This function is never called.
void OnePass_Checks() {
static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
"kEmptyShift disagrees with kEmptyAllFlags");
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
"kMaxCap disagrees with kMaxOnePassCapture");
}
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
uint32_t satisfied = Prog::EmptyFlags(context, p);
if (cond & kEmptyAllFlags & ~satisfied)
return false;
return true;
}
// Apply the capture bits in cond, saving p to the appropriate
// locations in cap[].
static void ApplyCaptures(uint32_t cond, const char* p,
const char** cap, int ncap) {
for (int i = 2; i < ncap; i++)
if (cond & (1 << kCapShift << i))
cap[i] = p;
}
// Computes the OneState* for the given nodeindex.
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
int nodeindex) {
return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
}
bool Prog::SearchOnePass(const StringPiece& text,
const StringPiece& const_context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch) {
if (anchor != kAnchored && kind != kFullMatch) {
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
return false;
}
// Make sure we have at least cap[1],
// because we use it to tell if we matched.
int ncap = 2*nmatch;
if (ncap < 2)
ncap = 2;
const char* cap[kMaxCap];
for (int i = 0; i < ncap; i++)
cap[i] = NULL;
const char* matchcap[kMaxCap];
for (int i = 0; i < ncap; i++)
matchcap[i] = NULL;
StringPiece context = const_context;
if (context.begin() == NULL)
context = text;
if (anchor_start() && context.begin() != text.begin())
return false;
if (anchor_end() && context.end() != text.end())
return false;
if (anchor_end())
kind = kFullMatch;
uint8_t* nodes = onepass_nodes_;
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
// start() is always mapped to the zeroth OneState.
OneState* state = IndexToNode(nodes, statesize, 0);
uint8_t* bytemap = bytemap_;
const char* bp = text.begin();
const char* ep = text.end();
const char* p;
bool matched = false;
matchcap[0] = bp;
cap[0] = bp;
uint32_t nextmatchcond = state->matchcond;
for (p = bp; p < ep; p++) {
int c = bytemap[*p & 0xFF];
uint32_t matchcond = nextmatchcond;
uint32_t cond = state->action[c];
// Determine whether we can reach act->next.
// If so, advance state and nextmatchcond.
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
uint32_t nextindex = cond >> kIndexShift;
state = IndexToNode(nodes, statesize, nextindex);
nextmatchcond = state->matchcond;
} else {
state = NULL;
nextmatchcond = kImpossible;
}
// This code section is carefully tuned.
// The goto sequence is about 10% faster than the
// obvious rewrite as a large if statement in the
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
// Saving the match capture registers is expensive.
// Is this intermediate match worth thinking about?
// Not if we want a full match.
if (kind == kFullMatch)
goto skipmatch;
// Not if it's impossible.
if (matchcond == kImpossible)
goto skipmatch;
// Not if the possible match is beaten by the certain
// match at the next byte. When this test is useless
// (e.g., HTTPPartialMatchRE2) it slows the loop by
// about 10%, but when it avoids work (e.g., DotMatchRE2),
// it cuts the loop execution by about 45%.
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
goto skipmatch;
// Finally, the match conditions must be satisfied.
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
for (int i = 2; i < 2*nmatch; i++)
matchcap[i] = cap[i];
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, matchcap, ncap);
matchcap[1] = p;
matched = true;
// If we're in longest match mode, we have to keep
// going and see if we find a longer match.
// In first match mode, we can stop if the match
// takes priority over the next state for this input byte.
// That bit is per-input byte and thus in cond, not matchcond.
if (kind == kFirstMatch && (cond & kMatchWins))
goto done;
}
skipmatch:
if (state == NULL)
goto done;
if ((cond & kCapMask) && nmatch > 1)
ApplyCaptures(cond, p, cap, ncap);
}
// Look for match at end of input.
{
uint32_t matchcond = state->matchcond;
if (matchcond != kImpossible &&
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
if (nmatch > 1 && (matchcond & kCapMask))
ApplyCaptures(matchcond, p, cap, ncap);
for (int i = 2; i < ncap; i++)
matchcap[i] = cap[i];
matchcap[1] = p;
matched = true;
}
}
done:
if (!matched)
return false;
for (int i = 0; i < nmatch; i++)
match[i] =
StringPiece(matchcap[2 * i],
static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
return true;
}
// Analysis to determine whether a given regexp program is one-pass.
// If ip is not on workq, adds ip to work queue and returns true.
// If ip is already on work queue, does nothing and returns false.
// If ip is NULL, does nothing and returns true (pretends to add it).
typedef SparseSet Instq;
static bool AddQ(Instq *q, int id) {
if (id == 0)
return true;
if (q->contains(id))
return false;
q->insert(id);
return true;
}
struct InstCond {
int id;
uint32_t cond;
};
// Returns whether this is a one-pass program; that is,
// returns whether it is safe to use SearchOnePass on this program.
// These conditions must be true for any instruction ip:
//
// (1) for any other Inst nip, there is at most one input-free
// path from ip to nip.
// (2) there is at most one kInstByte instruction reachable from
// ip that matches any particular byte c.
// (3) there is at most one input-free path from ip to a kInstMatch
// instruction.
//
// This is actually just a conservative approximation: it might
// return false when the answer is true, when kInstEmptyWidth
// instructions are involved.
// Constructs and saves corresponding one-pass NFA on success.
bool Prog::IsOnePass() {
if (did_onepass_)
return onepass_nodes_ != NULL;
did_onepass_ = true;
if (start() == 0) // no match
return false;
// Steal memory for the one-pass NFA from the overall DFA budget.
// Willing to use at most 1/4 of the DFA budget (heuristic).
// Limit max node count to 65000 as a conservative estimate to
// avoid overflowing 16-bit node index in encoding.
int maxnodes = 2 + inst_count(kInstByteRange);
int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
return false;
// Flood the graph starting at the start state, and check
// that in each reachable state, each possible byte leads
// to a unique next state.
int stacksize = inst_count(kInstCapture) +
inst_count(kInstEmptyWidth) +
inst_count(kInstNop) + 1; // + 1 for start inst
InstCond* stack = new InstCond[stacksize];
int size = this->size();
int* nodebyid = new int[size]; // indexed by ip
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
// Originally, nodes was a uint8_t[maxnodes*statesize], but that was
// unnecessarily optimistic: why allocate a large amount of memory
// upfront for a large program when it is unlikely to be one-pass?
std::vector<uint8_t> nodes;
Instq tovisit(size), workq(size);
AddQ(&tovisit, start());
nodebyid[start()] = 0;
int nalloc = 1;
nodes.insert(nodes.end(), statesize, 0);
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
// Flood graph using manual stack, filling in actions as found.
// Default is none.
for (int b = 0; b < bytemap_range_; b++)
node->action[b] = kImpossible;
node->matchcond = kImpossible;
workq.clear();
bool matched = false;
int nstack = 0;
stack[nstack].id = id;
stack[nstack++].cond = 0;
while (nstack > 0) {
int id = stack[--nstack].id;
uint32_t cond = stack[nstack].cond;
Loop:
Prog::Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
// TODO(rsc): Ignoring kInstAltMatch optimization.
// Should implement it in this engine, but it's subtle.
DCHECK(!ip->last());
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstByteRange: {
int nextindex = nodebyid[ip->out()];
if (nextindex == -1) {
if (nalloc >= maxnodes) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
goto fail;
}
nextindex = nalloc;
AddQ(&tovisit, ip->out());
nodebyid[ip->out()] = nalloc;
nalloc++;
nodes.insert(nodes.end(), statesize, 0);
// Update node because it might have been invalidated.
node = IndexToNode(nodes.data(), statesize, nodeindex);
}
for (int c = ip->lo(); c <= ip->hi(); c++) {
int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
}
if (ip->foldcase()) {
Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
for (int c = lo; c <= hi; c++) {
int b = bytemap_[c];
// Skip any bytes immediately after c that are also in b.
while (c < 256-1 && bytemap_[c+1] == b)
c++;
uint32_t act = node->action[b];
uint32_t newact = (nextindex << kIndexShift) | cond;
if (matched)
newact |= kMatchWins;
if ((act & kImpossible) == kImpossible) {
node->action[b] = newact;
} else if (act != newact) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: conflict on byte %#x at state %d", c, *it);
goto fail;
}
}
}
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
}
case kInstCapture:
case kInstEmptyWidth:
case kInstNop:
if (!ip->last()) {
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
stack[nstack].id = id+1;
stack[nstack++].cond = cond;
}
if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
cond |= (1 << kCapShift) << ip->cap();
if (ip->opcode() == kInstEmptyWidth)
cond |= ip->empty();
// kInstCapture and kInstNop always proceed to ip->out().
// kInstEmptyWidth only sometimes proceeds to ip->out(),
// but as a conservative approximation we assume it always does.
// We could be a little more precise by looking at what c
// is, but that seems like overkill.
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, ip->out())) {
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple paths %d -> %d\n", *it, ip->out());
goto fail;
}
id = ip->out();
goto Loop;
case kInstMatch:
if (matched) {
// (3) is violated
if (ExtraDebug)
LOG(ERROR) << StringPrintf(
"Not OnePass: multiple matches from %d\n", *it);
goto fail;
}
matched = true;
node->matchcond = cond;
if (ip->last())
break;
// If already on work queue, (1) is violated: bail out.
if (!AddQ(&workq, id+1))
goto fail;
id = id+1;
goto Loop;
case kInstFail:
break;
}
}
}
if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR).
LOG(ERROR) << "bytemap:\n" << DumpByteMap();
LOG(ERROR) << "prog:\n" << Dump();
std::map<int, int> idmap;
for (int i = 0; i < size; i++)
if (nodebyid[i] != -1)
idmap[nodebyid[i]] = i;
string dump;
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
int id = *it;
int nodeindex = nodebyid[id];
if (nodeindex == -1)
continue;
OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
nodeindex, id, node->matchcond);
for (int i = 0; i < bytemap_range_; i++) {
if ((node->action[i] & kImpossible) == kImpossible)
continue;
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
i, node->action[i] & 0xFFFF,
node->action[i] >> kIndexShift,
idmap[node->action[i] >> kIndexShift]);
}
}
LOG(ERROR) << "nodes:\n" << dump;
}
dfa_mem_ -= nalloc*statesize;
onepass_nodes_ = new uint8_t[nalloc*statesize];
memmove(onepass_nodes_, nodes.data(), nalloc*statesize);
delete[] stack;
delete[] nodebyid;
return true;
fail:
delete[] stack;
delete[] nodebyid;
return false;
}
} // namespace re2

File diff suppressed because it is too large Load Diff

View File

@ -1,119 +0,0 @@
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc
#include "re2/unicode_groups.h"
namespace re2 {
static const URange16 code1[] = { /* \d */
{ 0x30, 0x39 },
};
static const URange16 code2[] = { /* \s */
{ 0x9, 0xa },
{ 0xc, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code3[] = { /* \w */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
const UGroup perl_groups[] = {
{ "\\d", +1, code1, 1 },
{ "\\D", -1, code1, 1 },
{ "\\s", +1, code2, 3 },
{ "\\S", -1, code2, 3 },
{ "\\w", +1, code3, 4 },
{ "\\W", -1, code3, 4 },
};
const int num_perl_groups = 6;
static const URange16 code4[] = { /* [:alnum:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code5[] = { /* [:alpha:] */
{ 0x41, 0x5a },
{ 0x61, 0x7a },
};
static const URange16 code6[] = { /* [:ascii:] */
{ 0x0, 0x7f },
};
static const URange16 code7[] = { /* [:blank:] */
{ 0x9, 0x9 },
{ 0x20, 0x20 },
};
static const URange16 code8[] = { /* [:cntrl:] */
{ 0x0, 0x1f },
{ 0x7f, 0x7f },
};
static const URange16 code9[] = { /* [:digit:] */
{ 0x30, 0x39 },
};
static const URange16 code10[] = { /* [:graph:] */
{ 0x21, 0x7e },
};
static const URange16 code11[] = { /* [:lower:] */
{ 0x61, 0x7a },
};
static const URange16 code12[] = { /* [:print:] */
{ 0x20, 0x7e },
};
static const URange16 code13[] = { /* [:punct:] */
{ 0x21, 0x2f },
{ 0x3a, 0x40 },
{ 0x5b, 0x60 },
{ 0x7b, 0x7e },
};
static const URange16 code14[] = { /* [:space:] */
{ 0x9, 0xd },
{ 0x20, 0x20 },
};
static const URange16 code15[] = { /* [:upper:] */
{ 0x41, 0x5a },
};
static const URange16 code16[] = { /* [:word:] */
{ 0x30, 0x39 },
{ 0x41, 0x5a },
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
};
static const URange16 code17[] = { /* [:xdigit:] */
{ 0x30, 0x39 },
{ 0x41, 0x46 },
{ 0x61, 0x66 },
};
const UGroup posix_groups[] = {
{ "[:alnum:]", +1, code4, 3 },
{ "[:^alnum:]", -1, code4, 3 },
{ "[:alpha:]", +1, code5, 2 },
{ "[:^alpha:]", -1, code5, 2 },
{ "[:ascii:]", +1, code6, 1 },
{ "[:^ascii:]", -1, code6, 1 },
{ "[:blank:]", +1, code7, 2 },
{ "[:^blank:]", -1, code7, 2 },
{ "[:cntrl:]", +1, code8, 2 },
{ "[:^cntrl:]", -1, code8, 2 },
{ "[:digit:]", +1, code9, 1 },
{ "[:^digit:]", -1, code9, 1 },
{ "[:graph:]", +1, code10, 1 },
{ "[:^graph:]", -1, code10, 1 },
{ "[:lower:]", +1, code11, 1 },
{ "[:^lower:]", -1, code11, 1 },
{ "[:print:]", +1, code12, 1 },
{ "[:^print:]", -1, code12, 1 },
{ "[:punct:]", +1, code13, 4 },
{ "[:^punct:]", -1, code13, 4 },
{ "[:space:]", +1, code14, 2 },
{ "[:^space:]", -1, code14, 2 },
{ "[:upper:]", +1, code15, 1 },
{ "[:^upper:]", -1, code15, 1 },
{ "[:word:]", +1, code16, 4 },
{ "[:^word:]", -1, code16, 4 },
{ "[:xdigit:]", +1, code17, 3 },
{ "[:^xdigit:]", -1, code17, 3 },
};
const int num_posix_groups = 28;
} // namespace re2

View File

@ -1,711 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/prefilter.h"
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/re2.h"
#include "re2/unicode_casefold.h"
#include "re2/walker-inl.h"
namespace re2 {
static const bool ExtraDebug = false;
typedef std::set<string>::iterator SSIter;
typedef std::set<string>::const_iterator ConstSSIter;
// Initializes a Prefilter, allocating subs_ as necessary.
Prefilter::Prefilter(Op op) {
op_ = op;
subs_ = NULL;
if (op_ == AND || op_ == OR)
subs_ = new std::vector<Prefilter*>;
}
// Destroys a Prefilter.
Prefilter::~Prefilter() {
if (subs_) {
for (size_t i = 0; i < subs_->size(); i++)
delete (*subs_)[i];
delete subs_;
subs_ = NULL;
}
}
// Simplify if the node is an empty Or or And.
Prefilter* Prefilter::Simplify() {
if (op_ != AND && op_ != OR) {
return this;
}
// Nothing left in the AND/OR.
if (subs_->empty()) {
if (op_ == AND)
op_ = ALL; // AND of nothing is true
else
op_ = NONE; // OR of nothing is false
return this;
}
// Just one subnode: throw away wrapper.
if (subs_->size() == 1) {
Prefilter* a = (*subs_)[0];
subs_->clear();
delete this;
return a->Simplify();
}
return this;
}
// Combines two Prefilters together to create an "op" (AND or OR).
// The passed Prefilters will be part of the returned Prefilter or deleted.
// Does lots of work to avoid creating unnecessarily complicated structures.
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
// If a, b can be rewritten as op, do so.
a = a->Simplify();
b = b->Simplify();
// Canonicalize: a->op <= b->op.
if (a->op() > b->op()) {
Prefilter* t = a;
a = b;
b = t;
}
// Trivial cases.
// ALL AND b = b
// NONE OR b = b
// ALL OR b = ALL
// NONE AND b = NONE
// Don't need to look at b, because of canonicalization above.
// ALL and NONE are smallest opcodes.
if (a->op() == ALL || a->op() == NONE) {
if ((a->op() == ALL && op == AND) ||
(a->op() == NONE && op == OR)) {
delete a;
return b;
} else {
delete b;
return a;
}
}
// If a and b match op, merge their contents.
if (a->op() == op && b->op() == op) {
for (size_t i = 0; i < b->subs()->size(); i++) {
Prefilter* bb = (*b->subs())[i];
a->subs()->push_back(bb);
}
b->subs()->clear();
delete b;
return a;
}
// If a already has the same op as the op that is under construction
// add in b (similarly if b already has the same op, add in a).
if (b->op() == op) {
Prefilter* t = a;
a = b;
b = t;
}
if (a->op() == op) {
a->subs()->push_back(b);
return a;
}
// Otherwise just return the op.
Prefilter* c = new Prefilter(op);
c->subs()->push_back(a);
c->subs()->push_back(b);
return c;
}
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
return AndOr(AND, a, b);
}
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
return AndOr(OR, a, b);
}
static void SimplifyStringSet(std::set<string> *ss) {
// Now make sure that the strings aren't redundant. For example, if
// we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so delete "abc". This
// is because, when we are performing a string search to filter
// regexps, matching ab will already allow this regexp to be a
// candidate for match, so further matching abc is redundant.
for (SSIter i = ss->begin(); i != ss->end(); ++i) {
SSIter j = i;
++j;
while (j != ss->end()) {
// Increment j early so that we can erase the element it points to.
SSIter old_j = j;
++j;
if (old_j->find(*i) != string::npos)
ss->erase(old_j);
}
}
}
Prefilter* Prefilter::OrStrings(std::set<string>* ss) {
SimplifyStringSet(ss);
Prefilter* or_prefilter = NULL;
if (!ss->empty()) {
or_prefilter = new Prefilter(NONE);
for (SSIter i = ss->begin(); i != ss->end(); ++i)
or_prefilter = Or(or_prefilter, FromString(*i));
}
return or_prefilter;
}
static Rune ToLowerRune(Rune r) {
if (r < Runeself) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
const CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
}
static Rune ToLowerRuneLatin1(Rune r) {
if ('A' <= r && r <= 'Z')
r += 'a' - 'A';
return r;
}
Prefilter* Prefilter::FromString(const string& str) {
Prefilter* m = new Prefilter(Prefilter::ATOM);
m->atom_ = str;
return m;
}
// Information about a regexp used during computation of Prefilter.
// Can be thought of as information about the set of strings matching
// the given regular expression.
class Prefilter::Info {
public:
Info();
~Info();
// More constructors. They delete their Info* arguments.
static Info* Alt(Info* a, Info* b);
static Info* Concat(Info* a, Info* b);
static Info* And(Info* a, Info* b);
static Info* Star(Info* a);
static Info* Plus(Info* a);
static Info* Quest(Info* a);
static Info* EmptyString();
static Info* NoMatch();
static Info* AnyChar();
static Info* CClass(CharClass* cc, bool latin1);
static Info* Literal(Rune r);
static Info* LiteralLatin1(Rune r);
static Info* AnyMatch();
// Format Info as a string.
string ToString();
// Caller takes ownership of the Prefilter.
Prefilter* TakeMatch();
std::set<string>& exact() { return exact_; }
bool is_exact() const { return is_exact_; }
class Walker;
private:
std::set<string> exact_;
// When is_exact_ is true, the strings that match
// are placed in exact_. When it is no longer an exact
// set of strings that match this RE, then is_exact_
// is false and the match_ contains the required match
// criteria.
bool is_exact_;
// Accumulated Prefilter query that any
// match for this regexp is guaranteed to match.
Prefilter* match_;
};
Prefilter::Info::Info()
: is_exact_(false),
match_(NULL) {
}
Prefilter::Info::~Info() {
delete match_;
}
Prefilter* Prefilter::Info::TakeMatch() {
if (is_exact_) {
match_ = Prefilter::OrStrings(&exact_);
is_exact_ = false;
}
Prefilter* m = match_;
match_ = NULL;
return m;
}
// Format a Info in string form.
string Prefilter::Info::ToString() {
if (is_exact_) {
int n = 0;
string s;
for (std::set<string>::iterator i = exact_.begin();
i != exact_.end();
++i) {
if (n++ > 0)
s += ",";
s += *i;
}
return s;
}
if (match_)
return match_->DebugString();
return "";
}
// Add the strings from src to dst.
static void CopyIn(const std::set<string>& src,
std::set<string>* dst) {
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
dst->insert(*i);
}
// Add the cross-product of a and b to dst.
// (For each string i in a and j in b, add i+j.)
static void CrossProduct(const std::set<string>& a,
const std::set<string>& b,
std::set<string>* dst) {
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
dst->insert(*i + *j);
}
// Concats a and b. Requires that both are exact sets.
// Forms an exact set that is a crossproduct of a and b.
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
if (a == NULL)
return b;
DCHECK(a->is_exact_);
DCHECK(b && b->is_exact_);
Info *ab = new Info();
CrossProduct(a->exact_, b->exact_, &ab->exact_);
ab->is_exact_ = true;
delete a;
delete b;
return ab;
}
// Constructs an inexact Info for ab given a and b.
// Used only when a or b is not exact or when the
// exact cross product is likely to be too big.
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
if (a == NULL)
return b;
if (b == NULL)
return a;
Info *ab = new Info();
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
delete a;
delete b;
return ab;
}
// Constructs Info for a|b given a and b.
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
Info *ab = new Info();
if (a->is_exact_ && b->is_exact_) {
CopyIn(a->exact_, &ab->exact_);
CopyIn(b->exact_, &ab->exact_);
ab->is_exact_ = true;
} else {
// Either a or b has is_exact_ = false. If the other
// one has is_exact_ = true, we move it to match_ and
// then create a OR of a,b. The resulting Info has
// is_exact_ = false.
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
ab->is_exact_ = false;
}
delete a;
delete b;
return ab;
}
// Constructs Info for a? given a.
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
Info *ab = new Info();
ab->is_exact_ = false;
ab->match_ = new Prefilter(ALL);
delete a;
return ab;
}
// Constructs Info for a* given a.
// Same as a? -- not much to do.
Prefilter::Info* Prefilter::Info::Star(Info *a) {
return Quest(a);
}
// Constructs Info for a+ given a. If a was exact set, it isn't
// anymore.
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
Info *ab = new Info();
ab->match_ = a->TakeMatch();
ab->is_exact_ = false;
delete a;
return ab;
}
static string RuneToString(Rune r) {
char buf[UTFmax];
int n = runetochar(buf, &r);
return string(buf, n);
}
static string RuneToStringLatin1(Rune r) {
char c = r & 0xff;
return string(&c, 1);
}
// Constructs Info for literal rune.
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToString(ToLowerRune(r)));
info->is_exact_ = true;
return info;
}
// Constructs Info for literal rune for Latin1 encoded string.
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
Info* info = new Info();
info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
info->is_exact_ = true;
return info;
}
// Constructs Info for dot (any character).
Prefilter::Info* Prefilter::Info::AnyChar() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
// Constructs Prefilter::Info for no possible match.
Prefilter::Info* Prefilter::Info::NoMatch() {
Prefilter::Info* info = new Prefilter::Info();
info->match_ = new Prefilter(NONE);
return info;
}
// Constructs Prefilter::Info for any possible match.
// This Prefilter::Info is valid for any regular expression,
// since it makes no assertions whatsoever about the
// strings being matched.
Prefilter::Info* Prefilter::Info::AnyMatch() {
Prefilter::Info *info = new Prefilter::Info();
info->match_ = new Prefilter(ALL);
return info;
}
// Constructs Prefilter::Info for just the empty string.
Prefilter::Info* Prefilter::Info::EmptyString() {
Prefilter::Info* info = new Prefilter::Info();
info->is_exact_ = true;
info->exact_.insert("");
return info;
}
// Constructs Prefilter::Info for a character class.
typedef CharClass::iterator CCIter;
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
bool latin1) {
if (ExtraDebug) {
LOG(ERROR) << "CharClassInfo:";
for (CCIter i = cc->begin(); i != cc->end(); ++i)
LOG(ERROR) << " " << i->lo << "-" << i->hi;
}
// If the class is too large, it's okay to overestimate.
if (cc->size() > 10)
return AnyChar();
Prefilter::Info *a = new Prefilter::Info();
for (CCIter i = cc->begin(); i != cc->end(); ++i)
for (Rune r = i->lo; r <= i->hi; r++) {
if (latin1) {
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
} else {
a->exact_.insert(RuneToString(ToLowerRune(r)));
}
}
a->is_exact_ = true;
if (ExtraDebug)
LOG(ERROR) << " = " << a->ToString();
return a;
}
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
public:
Walker(bool latin1) : latin1_(latin1) {}
virtual Info* PostVisit(
Regexp* re, Info* parent_arg,
Info* pre_arg,
Info** child_args, int nchild_args);
virtual Info* ShortVisit(
Regexp* re,
Info* parent_arg);
bool latin1() { return latin1_; }
private:
bool latin1_;
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
};
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
if (ExtraDebug)
LOG(ERROR) << "BuildPrefilter::Info: " << re->ToString();
bool latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
Prefilter::Info::Walker w(latin1);
Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
if (w.stopped_early()) {
delete info;
return NULL;
}
return info;
}
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
Regexp* re, Prefilter::Info* parent_arg) {
return AnyMatch();
}
// Constructs the Prefilter::Info for the given regular expression.
// Assumes re is simplified.
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
Regexp* re, Prefilter::Info* parent_arg,
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
int nchild_args) {
Prefilter::Info *info;
switch (re->op()) {
default:
case kRegexpRepeat:
LOG(DFATAL) << "Bad regexp op " << re->op();
info = EmptyString();
break;
case kRegexpNoMatch:
info = NoMatch();
break;
// These ops match the empty string:
case kRegexpEmptyMatch: // anywhere
case kRegexpBeginLine: // at beginning of line
case kRegexpEndLine: // at end of line
case kRegexpBeginText: // at beginning of text
case kRegexpEndText: // at end of text
case kRegexpWordBoundary: // at word boundary
case kRegexpNoWordBoundary: // not at word boundary
info = EmptyString();
break;
case kRegexpLiteral:
if (latin1()) {
info = LiteralLatin1(re->rune());
}
else {
info = Literal(re->rune());
}
break;
case kRegexpLiteralString:
if (re->nrunes() == 0) {
info = NoMatch();
break;
}
if (latin1()) {
info = LiteralLatin1(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, LiteralLatin1(re->runes()[i]));
}
} else {
info = Literal(re->runes()[0]);
for (int i = 1; i < re->nrunes(); i++) {
info = Concat(info, Literal(re->runes()[i]));
}
}
break;
case kRegexpConcat: {
// Accumulate in info.
// Exact is concat of recent contiguous exact nodes.
info = NULL;
Info* exact = NULL;
for (int i = 0; i < nchild_args; i++) {
Info* ci = child_args[i]; // child info
if (!ci->is_exact() ||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
// Exact run is over.
info = And(info, exact);
exact = NULL;
// Add this child's info.
info = And(info, ci);
} else {
// Append to exact run.
exact = Concat(exact, ci);
}
}
info = And(info, exact);
}
break;
case kRegexpAlternate:
info = child_args[0];
for (int i = 1; i < nchild_args; i++)
info = Alt(info, child_args[i]);
break;
case kRegexpStar:
info = Star(child_args[0]);
break;
case kRegexpQuest:
info = Quest(child_args[0]);
break;
case kRegexpPlus:
info = Plus(child_args[0]);
break;
case kRegexpAnyChar:
// Claim nothing, except that it's not empty.
info = AnyChar();
break;
case kRegexpCharClass:
info = CClass(re->cc(), latin1());
break;
case kRegexpCapture:
// These don't affect the set of matching strings.
info = child_args[0];
break;
}
if (ExtraDebug)
LOG(ERROR) << "BuildInfo " << re->ToString()
<< ": " << (info ? info->ToString() : "");
return info;
}
Prefilter* Prefilter::FromRegexp(Regexp* re) {
if (re == NULL)
return NULL;
Regexp* simple = re->Simplify();
Prefilter::Info *info = BuildInfo(simple);
simple->Decref();
if (info == NULL)
return NULL;
Prefilter* m = info->TakeMatch();
delete info;
return m;
}
string Prefilter::DebugString() const {
switch (op_) {
default:
LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
return StringPrintf("op%d", op_);
case NONE:
return "*no-matches*";
case ATOM:
return atom_;
case ALL:
return "";
case AND: {
string s = "";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += " ";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
return s;
}
case OR: {
string s = "(";
for (size_t i = 0; i < subs_->size(); i++) {
if (i > 0)
s += "|";
Prefilter* sub = (*subs_)[i];
s += sub ? sub->DebugString() : "<nil>";
}
s += ")";
return s;
}
}
}
Prefilter* Prefilter::FromRE2(const RE2* re2) {
if (re2 == NULL)
return NULL;
Regexp* regexp = re2->Regexp();
if (regexp == NULL)
return NULL;
return FromRegexp(regexp);
}
} // namespace re2

View File

@ -1,108 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_H_
#define RE2_PREFILTER_H_
// Prefilter is the class used to extract string guards from regexps.
// Rather than using Prefilter class directly, use FilteredRE2.
// See filtered_re2.h
#include <set>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
namespace re2 {
class RE2;
class Regexp;
class Prefilter {
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
public:
enum Op {
ALL = 0, // Everything matches
NONE, // Nothing matches
ATOM, // The string atom() must match
AND, // All in subs() must match
OR, // One of subs() must match
};
explicit Prefilter(Op op);
~Prefilter();
Op op() { return op_; }
const string& atom() const { return atom_; }
void set_unique_id(int id) { unique_id_ = id; }
int unique_id() const { return unique_id_; }
// The children of the Prefilter node.
std::vector<Prefilter*>* subs() {
DCHECK(op_ == AND || op_ == OR);
return subs_;
}
// Set the children vector. Prefilter takes ownership of subs and
// subs_ will be deleted when Prefilter is deleted.
void set_subs(std::vector<Prefilter*>* subs) { subs_ = subs; }
// Given a RE2, return a Prefilter. The caller takes ownership of
// the Prefilter and should deallocate it. Returns NULL if Prefilter
// cannot be formed.
static Prefilter* FromRE2(const RE2* re2);
// Returns a readable debug string of the prefilter.
string DebugString() const;
private:
class Info;
// Combines two prefilters together to create an AND. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* And(Prefilter* a, Prefilter* b);
// Combines two prefilters together to create an OR. The passed
// Prefilters will be part of the returned Prefilter or deleted.
static Prefilter* Or(Prefilter* a, Prefilter* b);
// Generalized And/Or
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
static Prefilter* FromRegexp(Regexp* a);
static Prefilter* FromString(const string& str);
static Prefilter* OrStrings(std::set<string>* ss);
static Info* BuildInfo(Regexp* re);
Prefilter* Simplify();
// Kind of Prefilter.
Op op_;
// Sub-matches for AND or OR Prefilter.
std::vector<Prefilter*>* subs_;
// Actual string to match in leaf node.
string atom_;
// If different prefilters have the same string atom, or if they are
// structurally the same (e.g., OR of same atom strings) they are
// considered the same unique nodes. This is the id for each unique
// node. This field is populated with a unique id for every node,
// and -1 for duplicate nodes.
int unique_id_;
Prefilter(const Prefilter&) = delete;
Prefilter& operator=(const Prefilter&) = delete;
};
} // namespace re2
#endif // RE2_PREFILTER_H_

View File

@ -1,405 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/prefilter_tree.h"
#include <stddef.h>
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/prefilter.h"
#include "re2/re2.h"
namespace re2 {
static const bool ExtraDebug = false;
PrefilterTree::PrefilterTree()
: compiled_(false),
min_atom_len_(3) {
}
PrefilterTree::PrefilterTree(int min_atom_len)
: compiled_(false),
min_atom_len_(min_atom_len) {
}
PrefilterTree::~PrefilterTree() {
for (size_t i = 0; i < prefilter_vec_.size(); i++)
delete prefilter_vec_[i];
for (size_t i = 0; i < entries_.size(); i++)
delete entries_[i].parents;
}
void PrefilterTree::Add(Prefilter* prefilter) {
if (compiled_) {
LOG(DFATAL) << "Add called after Compile.";
return;
}
if (prefilter != NULL && !KeepNode(prefilter)) {
delete prefilter;
prefilter = NULL;
}
prefilter_vec_.push_back(prefilter);
}
void PrefilterTree::Compile(std::vector<string>* atom_vec) {
if (compiled_) {
LOG(DFATAL) << "Compile called already.";
return;
}
// We do this check to support some legacy uses of
// PrefilterTree that call Compile before adding any regexps,
// and expect Compile not to have effect.
if (prefilter_vec_.empty())
return;
compiled_ = true;
// TODO(junyer): Use std::unordered_set<Prefilter*> instead?
NodeMap nodes;
AssignUniqueIds(&nodes, atom_vec);
// Identify nodes that are too common among prefilters and are
// triggering too many parents. Then get rid of them if possible.
// Note that getting rid of a prefilter node simply means they are
// no longer necessary for their parent to trigger; that is, we do
// not miss out on any regexps triggering by getting rid of a
// prefilter node.
for (size_t i = 0; i < entries_.size(); i++) {
StdIntMap* parents = entries_[i].parents;
if (parents->size() > 8) {
// This one triggers too many things. If all the parents are AND
// nodes and have other things guarding them, then get rid of
// this trigger. TODO(vsri): Adjust the threshold appropriately,
// make it a function of total number of nodes?
bool have_other_guard = true;
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it) {
have_other_guard = have_other_guard &&
(entries_[it->first].propagate_up_at_count > 1);
}
if (have_other_guard) {
for (StdIntMap::iterator it = parents->begin();
it != parents->end(); ++it)
entries_[it->first].propagate_up_at_count -= 1;
parents->clear(); // Forget the parents
}
}
}
if (ExtraDebug)
PrintDebugInfo(&nodes);
}
Prefilter* PrefilterTree::CanonicalNode(NodeMap* nodes, Prefilter* node) {
string node_string = NodeString(node);
std::map<string, Prefilter*>::iterator iter = nodes->find(node_string);
if (iter == nodes->end())
return NULL;
return (*iter).second;
}
string PrefilterTree::NodeString(Prefilter* node) const {
// Adding the operation disambiguates AND/OR/atom nodes.
string s = StringPrintf("%d", node->op()) + ":";
if (node->op() == Prefilter::ATOM) {
s += node->atom();
} else {
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
s += ',';
s += StringPrintf("%d", (*node->subs())[i]->unique_id());
}
}
return s;
}
bool PrefilterTree::KeepNode(Prefilter* node) const {
if (node == NULL)
return false;
switch (node->op()) {
default:
LOG(DFATAL) << "Unexpected op in KeepNode: " << node->op();
return false;
case Prefilter::ALL:
return false;
case Prefilter::ATOM:
return node->atom().size() >= static_cast<size_t>(min_atom_len_);
case Prefilter::AND: {
int j = 0;
std::vector<Prefilter*>* subs = node->subs();
for (size_t i = 0; i < subs->size(); i++)
if (KeepNode((*subs)[i]))
(*subs)[j++] = (*subs)[i];
else
delete (*subs)[i];
subs->resize(j);
return j > 0;
}
case Prefilter::OR:
for (size_t i = 0; i < node->subs()->size(); i++)
if (!KeepNode((*node->subs())[i]))
return false;
return true;
}
}
void PrefilterTree::AssignUniqueIds(NodeMap* nodes,
std::vector<string>* atom_vec) {
atom_vec->clear();
// Build vector of all filter nodes, sorted topologically
// from top to bottom in v.
std::vector<Prefilter*> v;
// Add the top level nodes of each regexp prefilter.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
Prefilter* f = prefilter_vec_[i];
if (f == NULL)
unfiltered_.push_back(static_cast<int>(i));
// We push NULL also on to v, so that we maintain the
// mapping of index==regexpid for level=0 prefilter nodes.
v.push_back(f);
}
// Now add all the descendant nodes.
for (size_t i = 0; i < v.size(); i++) {
Prefilter* f = v[i];
if (f == NULL)
continue;
if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
const std::vector<Prefilter*>& subs = *f->subs();
for (size_t j = 0; j < subs.size(); j++)
v.push_back(subs[j]);
}
}
// Identify unique nodes.
int unique_id = 0;
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter *node = v[i];
if (node == NULL)
continue;
node->set_unique_id(-1);
Prefilter* canonical = CanonicalNode(nodes, node);
if (canonical == NULL) {
// Any further nodes that have the same node string
// will find this node as the canonical node.
nodes->emplace(NodeString(node), node);
if (node->op() == Prefilter::ATOM) {
atom_vec->push_back(node->atom());
atom_index_to_id_.push_back(unique_id);
}
node->set_unique_id(unique_id++);
} else {
node->set_unique_id(canonical->unique_id());
}
}
entries_.resize(nodes->size());
// Create parent StdIntMap for the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
entry->parents = new StdIntMap();
}
// Fill the entries.
for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
Prefilter* prefilter = v[i];
if (prefilter == NULL)
continue;
if (CanonicalNode(nodes, prefilter) != prefilter)
continue;
Entry* entry = &entries_[prefilter->unique_id()];
switch (prefilter->op()) {
default:
case Prefilter::ALL:
LOG(DFATAL) << "Unexpected op: " << prefilter->op();
return;
case Prefilter::ATOM:
entry->propagate_up_at_count = 1;
break;
case Prefilter::OR:
case Prefilter::AND: {
std::set<int> uniq_child;
for (size_t j = 0; j < prefilter->subs()->size(); j++) {
Prefilter* child = (*prefilter->subs())[j];
Prefilter* canonical = CanonicalNode(nodes, child);
if (canonical == NULL) {
LOG(DFATAL) << "Null canonical node";
return;
}
int child_id = canonical->unique_id();
uniq_child.insert(child_id);
// To the child, we want to add to parent indices.
Entry* child_entry = &entries_[child_id];
if (child_entry->parents->find(prefilter->unique_id()) ==
child_entry->parents->end()) {
(*child_entry->parents)[prefilter->unique_id()] = 1;
}
}
entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
? static_cast<int>(uniq_child.size())
: 1;
break;
}
}
}
// For top level nodes, populate regexp id.
for (size_t i = 0; i < prefilter_vec_.size(); i++) {
if (prefilter_vec_[i] == NULL)
continue;
int id = CanonicalNode(nodes, prefilter_vec_[i])->unique_id();
DCHECK_LE(0, id);
Entry* entry = &entries_[id];
entry->regexps.push_back(static_cast<int>(i));
}
}
// Functions for triggering during search.
void PrefilterTree::RegexpsGivenStrings(
const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const {
regexps->clear();
if (!compiled_) {
LOG(ERROR) << "RegexpsGivenStrings called before Compile.";
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
regexps->push_back(static_cast<int>(i));
} else {
if (!prefilter_vec_.empty()) {
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
std::vector<int> matched_atom_ids;
for (size_t j = 0; j < matched_atoms.size(); j++) {
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
}
PropagateMatch(matched_atom_ids, &regexps_map);
for (IntMap::iterator it = regexps_map.begin();
it != regexps_map.end();
++it)
regexps->push_back(it->index());
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
}
}
std::sort(regexps->begin(), regexps->end());
}
void PrefilterTree::PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const {
IntMap count(static_cast<int>(entries_.size()));
IntMap work(static_cast<int>(entries_.size()));
for (size_t i = 0; i < atom_ids.size(); i++)
work.set(atom_ids[i], 1);
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
const Entry& entry = entries_[it->index()];
// Record regexps triggered.
for (size_t i = 0; i < entry.regexps.size(); i++)
regexps->set(entry.regexps[i], 1);
int c;
// Pass trigger up to parents.
for (StdIntMap::iterator it = entry.parents->begin();
it != entry.parents->end();
++it) {
int j = it->first;
const Entry& parent = entries_[j];
// Delay until all the children have succeeded.
if (parent.propagate_up_at_count > 1) {
if (count.has_index(j)) {
c = count.get_existing(j) + 1;
count.set_existing(j, c);
} else {
c = 1;
count.set_new(j, c);
}
if (c < parent.propagate_up_at_count)
continue;
}
// Trigger the parent.
work.set(j, 1);
}
}
}
// Debugging help.
void PrefilterTree::PrintPrefilter(int regexpid) {
LOG(ERROR) << DebugNodeString(prefilter_vec_[regexpid]);
}
void PrefilterTree::PrintDebugInfo(NodeMap* nodes) {
LOG(ERROR) << "#Unique Atoms: " << atom_index_to_id_.size();
LOG(ERROR) << "#Unique Nodes: " << entries_.size();
for (size_t i = 0; i < entries_.size(); ++i) {
StdIntMap* parents = entries_[i].parents;
const std::vector<int>& regexps = entries_[i].regexps;
LOG(ERROR) << "EntryId: " << i
<< " N: " << parents->size() << " R: " << regexps.size();
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
LOG(ERROR) << it->first;
}
LOG(ERROR) << "Map:";
for (std::map<string, Prefilter*>::const_iterator iter = nodes->begin();
iter != nodes->end(); ++iter)
LOG(ERROR) << "NodeId: " << (*iter).second->unique_id()
<< " Str: " << (*iter).first;
}
string PrefilterTree::DebugNodeString(Prefilter* node) const {
string node_string = "";
if (node->op() == Prefilter::ATOM) {
DCHECK(!node->atom().empty());
node_string += node->atom();
} else {
// Adding the operation disambiguates AND and OR nodes.
node_string += node->op() == Prefilter::AND ? "AND" : "OR";
node_string += "(";
for (size_t i = 0; i < node->subs()->size(); i++) {
if (i > 0)
node_string += ',';
node_string += StringPrintf("%d", (*node->subs())[i]->unique_id());
node_string += ":";
node_string += DebugNodeString((*node->subs())[i]);
}
node_string += ")";
}
return node_string;
}
} // namespace re2

View File

@ -1,139 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PREFILTER_TREE_H_
#define RE2_PREFILTER_TREE_H_
// The PrefilterTree class is used to form an AND-OR tree of strings
// that would trigger each regexp. The 'prefilter' of each regexp is
// added tp PrefilterTree, and then PrefilterTree is used to find all
// the unique strings across the prefilters. During search, by using
// matches from a string matching engine, PrefilterTree deduces the
// set of regexps that are to be triggered. The 'string matching
// engine' itself is outside of this class, and the caller can use any
// favorite engine. PrefilterTree provides a set of strings (called
// atoms) that the user of this class should use to do the string
// matching.
#include <map>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/sparse_array.h"
#include "re2/prefilter.h"
namespace re2 {
class PrefilterTree {
public:
PrefilterTree();
explicit PrefilterTree(int min_atom_len);
~PrefilterTree();
// Adds the prefilter for the next regexp. Note that we assume that
// Add called sequentially for all regexps. All Add calls
// must precede Compile.
void Add(Prefilter* prefilter);
// The Compile returns a vector of string in atom_vec.
// Call this after all the prefilters are added through Add.
// No calls to Add after Compile are allowed.
// The caller should use the returned set of strings to do string matching.
// Each time a string matches, the corresponding index then has to be
// and passed to RegexpsGivenStrings below.
void Compile(std::vector<string>* atom_vec);
// Given the indices of the atoms that matched, returns the indexes
// of regexps that should be searched. The matched_atoms should
// contain all the ids of string atoms that were found to match the
// content. The caller can use any string match engine to perform
// this function. This function is thread safe.
void RegexpsGivenStrings(const std::vector<int>& matched_atoms,
std::vector<int>* regexps) const;
// Print debug prefilter. Also prints unique ids associated with
// nodes of the prefilter of the regexp.
void PrintPrefilter(int regexpid);
private:
typedef SparseArray<int> IntMap;
typedef std::map<int, int> StdIntMap;
typedef std::map<string, Prefilter*> NodeMap;
// Each unique node has a corresponding Entry that helps in
// passing the matching trigger information along the tree.
struct Entry {
public:
// How many children should match before this node triggers the
// parent. For an atom and an OR node, this is 1 and for an AND
// node, it is the number of unique children.
int propagate_up_at_count;
// When this node is ready to trigger the parent, what are the indices
// of the parent nodes to trigger. The reason there may be more than
// one is because of sharing. For example (abc | def) and (xyz | def)
// are two different nodes, but they share the atom 'def'. So when
// 'def' matches, it triggers two parents, corresponding to the two
// different OR nodes.
StdIntMap* parents;
// When this node is ready to trigger the parent, what are the
// regexps that are triggered.
std::vector<int> regexps;
};
// Returns true if the prefilter node should be kept.
bool KeepNode(Prefilter* node) const;
// This function assigns unique ids to various parts of the
// prefilter, by looking at if these nodes are already in the
// PrefilterTree.
void AssignUniqueIds(NodeMap* nodes, std::vector<string>* atom_vec);
// Given the matching atoms, find the regexps to be triggered.
void PropagateMatch(const std::vector<int>& atom_ids,
IntMap* regexps) const;
// Returns the prefilter node that has the same NodeString as this
// node. For the canonical node, returns node.
Prefilter* CanonicalNode(NodeMap* nodes, Prefilter* node);
// A string that uniquely identifies the node. Assumes that the
// children of node has already been assigned unique ids.
string NodeString(Prefilter* node) const;
// Recursively constructs a readable prefilter string.
string DebugNodeString(Prefilter* node) const;
// Used for debugging.
void PrintDebugInfo(NodeMap* nodes);
// These are all the nodes formed by Compile. Essentially, there is
// one node for each unique atom and each unique AND/OR node.
std::vector<Entry> entries_;
// indices of regexps that always pass through the filter (since we
// found no required literals in these regexps).
std::vector<int> unfiltered_;
// vector of Prefilter for all regexps.
std::vector<Prefilter*> prefilter_vec_;
// Atom index in returned strings to entry id mapping.
std::vector<int> atom_index_to_id_;
// Has the prefilter tree been compiled.
bool compiled_;
// Strings less than this length are not stored as atoms.
const int min_atom_len_;
PrefilterTree(const PrefilterTree&) = delete;
PrefilterTree& operator=(const PrefilterTree&) = delete;
};
} // namespace
#endif // RE2_PREFILTER_TREE_H_

View File

@ -1,826 +0,0 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Compiled regular expression representation.
// Tested by compile_test.cc
#include "re2/prog.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "re2/bitmap256.h"
#include "re2/stringpiece.h"
namespace re2 {
// Constructors per Inst opcode
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstAlt);
out1_ = out1;
}
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstByteRange);
lo_ = lo & 0xFF;
hi_ = hi & 0xFF;
foldcase_ = foldcase & 0xFF;
}
void Prog::Inst::InitCapture(int cap, uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstCapture);
cap_ = cap;
}
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_out_opcode(out, kInstEmptyWidth);
empty_ = empty;
}
void Prog::Inst::InitMatch(int32_t id) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstMatch);
match_id_ = id;
}
void Prog::Inst::InitNop(uint32_t out) {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstNop);
}
void Prog::Inst::InitFail() {
DCHECK_EQ(out_opcode_, 0);
set_opcode(kInstFail);
}
string Prog::Inst::Dump() {
switch (opcode()) {
default:
return StringPrintf("opcode %d", static_cast<int>(opcode()));
case kInstAlt:
return StringPrintf("alt -> %d | %d", out(), out1_);
case kInstAltMatch:
return StringPrintf("altmatch -> %d | %d", out(), out1_);
case kInstByteRange:
return StringPrintf("byte%s [%02x-%02x] -> %d",
foldcase_ ? "/i" : "",
lo_, hi_, out());
case kInstCapture:
return StringPrintf("capture %d -> %d", cap_, out());
case kInstEmptyWidth:
return StringPrintf("emptywidth %#x -> %d",
static_cast<int>(empty_), out());
case kInstMatch:
return StringPrintf("match! %d", match_id());
case kInstNop:
return StringPrintf("nop -> %d", out());
case kInstFail:
return StringPrintf("fail");
}
}
Prog::Prog()
: anchor_start_(false),
anchor_end_(false),
reversed_(false),
did_flatten_(false),
did_onepass_(false),
start_(0),
start_unanchored_(0),
size_(0),
bytemap_range_(0),
first_byte_(-1),
flags_(0),
list_count_(0),
inst_(NULL),
onepass_nodes_(NULL),
dfa_mem_(0),
dfa_first_(NULL),
dfa_longest_(NULL) {
}
Prog::~Prog() {
DeleteDFA(dfa_longest_);
DeleteDFA(dfa_first_);
delete[] onepass_nodes_;
delete[] inst_;
}
typedef SparseSet Workq;
static inline void AddToQueue(Workq* q, int id) {
if (id != 0)
q->insert(id);
}
static string ProgToString(Prog* prog, Workq* q) {
string s;
for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
int id = *i;
Prog::Inst* ip = prog->inst(id);
StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
AddToQueue(q, ip->out());
if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
AddToQueue(q, ip->out1());
}
return s;
}
static string FlattenedProgToString(Prog* prog, int start) {
string s;
for (int id = start; id < prog->size(); id++) {
Prog::Inst* ip = prog->inst(id);
if (ip->last())
StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
else
StringAppendF(&s, "%d+ %s\n", id, ip->Dump().c_str());
}
return s;
}
string Prog::Dump() {
if (did_flatten_)
return FlattenedProgToString(this, start_);
Workq q(size_);
AddToQueue(&q, start_);
return ProgToString(this, &q);
}
string Prog::DumpUnanchored() {
if (did_flatten_)
return FlattenedProgToString(this, start_unanchored_);
Workq q(size_);
AddToQueue(&q, start_unanchored_);
return ProgToString(this, &q);
}
string Prog::DumpByteMap() {
string map;
for (int c = 0; c < 256; c++) {
int b = bytemap_[c];
int lo = c;
while (c < 256-1 && bytemap_[c+1] == b)
c++;
int hi = c;
StringAppendF(&map, "[%02x-%02x] -> %d\n", lo, hi, b);
}
return map;
}
int Prog::first_byte() {
std::call_once(first_byte_once_, [](Prog* prog) {
prog->first_byte_ = prog->ComputeFirstByte();
}, this);
return first_byte_;
}
static bool IsMatch(Prog*, Prog::Inst*);
// Peep-hole optimizer.
void Prog::Optimize() {
Workq q(size_);
// Eliminate nops. Most are taken out during compilation
// but a few are hard to avoid.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
int j = ip->out();
Inst* jp;
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->set_out(j);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt) {
j = ip->out1();
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
j = jp->out();
}
ip->out1_ = j;
AddToQueue(&q, ip->out1());
}
}
// Insert kInstAltMatch instructions
// Look for
// ip: Alt -> j | k
// j: ByteRange [00-FF] -> ip
// k: Match
// or the reverse (the above is the greedy one).
// Rewrite Alt to AltMatch.
q.clear();
AddToQueue(&q, start_);
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
int id = *i;
Inst* ip = inst(id);
AddToQueue(&q, ip->out());
if (ip->opcode() == kInstAlt)
AddToQueue(&q, ip->out1());
if (ip->opcode() == kInstAlt) {
Inst* j = inst(ip->out());
Inst* k = inst(ip->out1());
if (j->opcode() == kInstByteRange && j->out() == id &&
j->lo() == 0x00 && j->hi() == 0xFF &&
IsMatch(this, k)) {
ip->set_opcode(kInstAltMatch);
continue;
}
if (IsMatch(this, j) &&
k->opcode() == kInstByteRange && k->out() == id &&
k->lo() == 0x00 && k->hi() == 0xFF) {
ip->set_opcode(kInstAltMatch);
}
}
}
}
// Is ip a guaranteed match at end of text, perhaps after some capturing?
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
for (;;) {
switch (ip->opcode()) {
default:
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
return false;
case kInstAlt:
case kInstAltMatch:
case kInstByteRange:
case kInstFail:
case kInstEmptyWidth:
return false;
case kInstCapture:
case kInstNop:
ip = prog->inst(ip->out());
break;
case kInstMatch:
return true;
}
}
}
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
int flags = 0;
// ^ and \A
if (p == text.begin())
flags |= kEmptyBeginText | kEmptyBeginLine;
else if (p[-1] == '\n')
flags |= kEmptyBeginLine;
// $ and \z
if (p == text.end())
flags |= kEmptyEndText | kEmptyEndLine;
else if (p < text.end() && p[0] == '\n')
flags |= kEmptyEndLine;
// \b and \B
if (p == text.begin() && p == text.end()) {
// no word boundary here
} else if (p == text.begin()) {
if (IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
} else if (p == text.end()) {
if (IsWordChar(p[-1]))
flags |= kEmptyWordBoundary;
} else {
if (IsWordChar(p[-1]) != IsWordChar(p[0]))
flags |= kEmptyWordBoundary;
}
if (!(flags & kEmptyWordBoundary))
flags |= kEmptyNonWordBoundary;
return flags;
}
// ByteMapBuilder implements a coloring algorithm.
//
// The first phase is a series of "mark and merge" batches: we mark one or more
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
// performance; rather, it means that the ranges are treated indistinguishably.
//
// Internally, the ranges are represented using a bitmap that stores the splits
// and a vector that stores the colors; both of them are indexed by the ranges'
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
// hi (if not already split), then recolor each range in between. The color map
// (i.e. from the old color to the new color) is maintained for the lifetime of
// the batch and so underpins this somewhat obscure approach to set operations.
//
// The second phase builds the bytemap from our internal state: we recolor each
// range, then store the new color (which is now the byte class) in each of the
// corresponding array elements. Finally, we output the number of byte classes.
class ByteMapBuilder {
public:
ByteMapBuilder() {
// Initial state: the [0-255] range has color 256.
// This will avoid problems during the second phase,
// in which we assign byte classes numbered from 0.
splits_.Set(255);
colors_.resize(256);
colors_[255] = 256;
nextcolor_ = 257;
}
void Mark(int lo, int hi);
void Merge();
void Build(uint8_t* bytemap, int* bytemap_range);
private:
int Recolor(int oldcolor);
Bitmap256 splits_;
std::vector<int> colors_;
int nextcolor_;
std::vector<std::pair<int, int>> colormap_;
std::vector<std::pair<int, int>> ranges_;
ByteMapBuilder(const ByteMapBuilder&) = delete;
ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
};
void ByteMapBuilder::Mark(int lo, int hi) {
DCHECK_GE(lo, 0);
DCHECK_GE(hi, 0);
DCHECK_LE(lo, 255);
DCHECK_LE(hi, 255);
DCHECK_LE(lo, hi);
// Ignore any [0-255] ranges. They cause us to recolor every range, which
// has no effect on the eventual result and is therefore a waste of time.
if (lo == 0 && hi == 255)
return;
ranges_.emplace_back(lo, hi);
}
void ByteMapBuilder::Merge() {
for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
it != ranges_.end();
++it) {
int lo = it->first-1;
int hi = it->second;
if (0 <= lo && !splits_.Test(lo)) {
splits_.Set(lo);
int next = splits_.FindNextSetBit(lo+1);
colors_[lo] = colors_[next];
}
if (!splits_.Test(hi)) {
splits_.Set(hi);
int next = splits_.FindNextSetBit(hi+1);
colors_[hi] = colors_[next];
}
int c = lo+1;
while (c < 256) {
int next = splits_.FindNextSetBit(c);
colors_[next] = Recolor(colors_[next]);
if (next == hi)
break;
c = next+1;
}
}
colormap_.clear();
ranges_.clear();
}
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
// Assign byte classes numbered from 0.
nextcolor_ = 0;
int c = 0;
while (c < 256) {
int next = splits_.FindNextSetBit(c);
uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
while (c <= next) {
bytemap[c] = b;
c++;
}
}
*bytemap_range = nextcolor_;
}
int ByteMapBuilder::Recolor(int oldcolor) {
// Yes, this is a linear search. There can be at most 256
// colors and there will typically be far fewer than that.
// Also, we need to consider keys *and* values in order to
// avoid recoloring a given range more than once per batch.
std::vector<std::pair<int, int>>::const_iterator it =
std::find_if(colormap_.begin(), colormap_.end(),
[=](const std::pair<int, int>& kv) -> bool {
return kv.first == oldcolor || kv.second == oldcolor;
});
if (it != colormap_.end())
return it->second;
int newcolor = nextcolor_;
nextcolor_++;
colormap_.emplace_back(oldcolor, newcolor);
return newcolor;
}
void Prog::ComputeByteMap() {
// Fill in bytemap with byte classes for the program.
// Ranges of bytes that are treated indistinguishably
// will be mapped to a single byte class.
ByteMapBuilder builder;
// Don't repeat the work for ^ and $.
bool marked_line_boundaries = false;
// Don't repeat the work for \b and \B.
bool marked_word_boundaries = false;
for (int id = 0; id < size(); id++) {
Inst* ip = inst(id);
if (ip->opcode() == kInstByteRange) {
int lo = ip->lo();
int hi = ip->hi();
builder.Mark(lo, hi);
if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
int foldlo = lo;
int foldhi = hi;
if (foldlo < 'a')
foldlo = 'a';
if (foldhi > 'z')
foldhi = 'z';
if (foldlo <= foldhi)
builder.Mark(foldlo + 'A' - 'a', foldhi + 'A' - 'a');
}
// If this Inst is not the last Inst in its list AND the next Inst is
// also a ByteRange AND the Insts have the same out, defer the merge.
if (!ip->last() &&
inst(id+1)->opcode() == kInstByteRange &&
ip->out() == inst(id+1)->out())
continue;
builder.Merge();
} else if (ip->opcode() == kInstEmptyWidth) {
if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
!marked_line_boundaries) {
builder.Mark('\n', '\n');
builder.Merge();
marked_line_boundaries = true;
}
if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
!marked_word_boundaries) {
// We require two batches here: the first for ranges that are word
// characters, the second for ranges that are not word characters.
for (bool isword : {true, false}) {
int j;
for (int i = 0; i < 256; i = j) {
for (j = i + 1; j < 256 &&
Prog::IsWordChar(static_cast<uint8_t>(i)) ==
Prog::IsWordChar(static_cast<uint8_t>(j));
j++)
;
if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
builder.Mark(i, j - 1);
}
builder.Merge();
}
marked_word_boundaries = true;
}
}
}
builder.Build(bytemap_, &bytemap_range_);
if (0) { // For debugging, use trivial bytemap.
LOG(ERROR) << "Using trivial bytemap.";
for (int i = 0; i < 256; i++)
bytemap_[i] = static_cast<uint8_t>(i);
bytemap_range_ = 256;
}
}
// Prog::Flatten() implements a graph rewriting algorithm.
//
// The overall process is similar to epsilon removal, but retains some epsilon
// transitions: those from Capture and EmptyWidth instructions; and those from
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
// in the worst case.) It might be best thought of as Alt instruction elision.
//
// In conceptual terms, it divides the Prog into "trees" of instructions, then
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
// is one or more instructions that grow from one "root" instruction to one or
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
// "root" is also the "leaf". In most cases, a "root" is the successor of some
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
// EmptyWidth or Match instruction. However, this is insufficient for handling
// nested nullable subexpressions correctly, so in some cases, a "root" is the
// dominator of the instructions reachable from some "successor root" (i.e. it
// has an unreachable predecessor) and is considered a "dominator root". Since
// only Alt instructions can be "dominator roots" (other instructions would be
// "leaves"), only Alt instructions are required to be marked as predecessors.
//
// Dividing the Prog into "trees" comprises two passes: marking the "successor
// roots" and the predecessors; and marking the "dominator roots". Sorting the
// "successor roots" by their bytecode offsets enables iteration in order from
// greatest to least during the second pass; by working backwards in this case
// and flooding the graph no further than "leaves" and already marked "roots",
// it becomes possible to mark "dominator roots" without doing excessive work.
//
// Traversing the "trees" is just iterating over the "roots" in order of their
// marking and flooding the graph no further than "leaves" and "roots". When a
// "leaf" is reached, the instruction is copied with its successor remapped to
// its "root" number. When a "root" is reached, a Nop instruction is generated
// with its successor remapped similarly. As each "list" is produced, its last
// instruction is marked as such. After all of the "lists" have been produced,
// a pass over their instructions remaps their successors to bytecode offsets.
void Prog::Flatten() {
if (did_flatten_)
return;
did_flatten_ = true;
// Scratch structures. It's important that these are reused by functions
// that we call in loops because they would thrash the heap otherwise.
SparseSet reachable(size());
std::vector<int> stk;
stk.reserve(size());
// First pass: Marks "successor roots" and predecessors.
// Builds the mapping from inst-ids to root-ids.
SparseArray<int> rootmap(size());
SparseArray<int> predmap(size());
std::vector<std::vector<int>> predvec;
MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
// Second pass: Marks "dominator roots".
SparseArray<int> sorted(rootmap);
std::sort(sorted.begin(), sorted.end(), sorted.less);
for (SparseArray<int>::const_iterator i = sorted.end() - 1;
i != sorted.begin();
--i) {
if (i->index() != start_unanchored() && i->index() != start())
MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
}
// Third pass: Emits "lists". Remaps outs to root-ids.
// Builds the mapping from root-ids to flat-ids.
std::vector<int> flatmap(rootmap.size());
std::vector<Inst> flat;
flat.reserve(size());
for (SparseArray<int>::const_iterator i = rootmap.begin();
i != rootmap.end();
++i) {
flatmap[i->value()] = static_cast<int>(flat.size());
EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
flat.back().set_last();
}
list_count_ = static_cast<int>(flatmap.size());
for (int i = 0; i < kNumInst; i++)
inst_count_[i] = 0;
// Fourth pass: Remaps outs to flat-ids.
// Counts instructions by opcode.
for (int id = 0; id < static_cast<int>(flat.size()); id++) {
Inst* ip = &flat[id];
if (ip->opcode() != kInstAltMatch) // handled in EmitList()
ip->set_out(flatmap[ip->out()]);
inst_count_[ip->opcode()]++;
}
int total = 0;
for (int i = 0; i < kNumInst; i++)
total += inst_count_[i];
DCHECK_EQ(total, static_cast<int>(flat.size()));
// Remap start_unanchored and start.
if (start_unanchored() == 0) {
DCHECK_EQ(start(), 0);
} else if (start_unanchored() == start()) {
set_start_unanchored(flatmap[1]);
set_start(flatmap[1]);
} else {
set_start_unanchored(flatmap[1]);
set_start(flatmap[2]);
}
// Finally, replace the old instructions with the new instructions.
size_ = static_cast<int>(flat.size());
delete[] inst_;
inst_ = new Inst[size_];
memmove(inst_, flat.data(), size_ * sizeof *inst_);
}
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk) {
// Mark the kInstFail instruction.
rootmap->set_new(0, rootmap->size());
// Mark the start_unanchored and start instructions.
if (!rootmap->has_index(start_unanchored()))
rootmap->set_new(start_unanchored(), rootmap->size());
if (!rootmap->has_index(start()))
rootmap->set_new(start(), rootmap->size());
reachable->clear();
stk->clear();
stk->push_back(start_unanchored());
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
case kInstAlt:
// Mark this instruction as a predecessor of each out.
for (int out : {ip->out(), ip->out1()}) {
if (!predmap->has_index(out)) {
predmap->set_new(out, static_cast<int>(predvec->size()));
predvec->emplace_back();
}
(*predvec)[predmap->get_existing(out)].emplace_back(id);
}
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
// Mark the out of this instruction as a "root".
if (!rootmap->has_index(ip->out()))
rootmap->set_new(ip->out(), rootmap->size());
id = ip->out();
goto Loop;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
break;
}
}
}
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk) {
reachable->clear();
stk->clear();
stk->push_back(root);
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
if (id != root && rootmap->has_index(id)) {
// We reached another "tree" via epsilon transition.
continue;
}
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
case kInstAlt:
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
break;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
break;
}
}
for (SparseSet::const_iterator i = reachable->begin();
i != reachable->end();
++i) {
int id = *i;
if (predmap->has_index(id)) {
for (int pred : (*predvec)[predmap->get_existing(id)]) {
if (!reachable->contains(pred)) {
// id has a predecessor that cannot be reached from root!
// Therefore, id must be a "root" too - mark it as such.
if (!rootmap->has_index(id))
rootmap->set_new(id, rootmap->size());
}
}
}
}
}
void Prog::EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk) {
reachable->clear();
stk->clear();
stk->push_back(root);
while (!stk->empty()) {
int id = stk->back();
stk->pop_back();
Loop:
if (reachable->contains(id))
continue;
reachable->insert_new(id);
if (id != root && rootmap->has_index(id)) {
// We reached another "tree" via epsilon transition. Emit a kInstNop
// instruction so that the Prog does not become quadratically larger.
flat->emplace_back();
flat->back().set_opcode(kInstNop);
flat->back().set_out(rootmap->get_existing(id));
continue;
}
Inst* ip = inst(id);
switch (ip->opcode()) {
default:
LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
break;
case kInstAltMatch:
flat->emplace_back();
flat->back().set_opcode(kInstAltMatch);
flat->back().set_out(static_cast<int>(flat->size()));
flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
FALLTHROUGH_INTENDED;
case kInstAlt:
stk->push_back(ip->out1());
id = ip->out();
goto Loop;
case kInstByteRange:
case kInstCapture:
case kInstEmptyWidth:
flat->emplace_back();
memmove(&flat->back(), ip, sizeof *ip);
flat->back().set_out(rootmap->get_existing(ip->out()));
break;
case kInstNop:
id = ip->out();
goto Loop;
case kInstMatch:
case kInstFail:
flat->emplace_back();
memmove(&flat->back(), ip, sizeof *ip);
break;
}
}
}
} // namespace re2

View File

@ -1,410 +0,0 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_PROG_H_
#define RE2_PROG_H_
// Compiled representation of regular expressions.
// See regexp.h for the Regexp class, which represents a regular
// expression symbolically.
#include <stdint.h>
#include <functional>
#include <mutex>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/sparse_array.h"
#include "util/sparse_set.h"
#include "re2/re2.h"
namespace re2 {
// Opcodes for Inst
enum InstOp {
kInstAlt = 0, // choose between out_ and out1_
kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
kInstCapture, // capturing parenthesis number cap_
kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
kInstMatch, // found a match!
kInstNop, // no-op; occasionally unavoidable
kInstFail, // never match; occasionally unavoidable
kNumInst,
};
// Bit flags for empty-width specials
enum EmptyOp {
kEmptyBeginLine = 1<<0, // ^ - beginning of line
kEmptyEndLine = 1<<1, // $ - end of line
kEmptyBeginText = 1<<2, // \A - beginning of text
kEmptyEndText = 1<<3, // \z - end of text
kEmptyWordBoundary = 1<<4, // \b - word boundary
kEmptyNonWordBoundary = 1<<5, // \B - not \b
kEmptyAllFlags = (1<<6)-1,
};
class DFA;
class Regexp;
// Compiled form of regexp program.
class Prog {
public:
Prog();
~Prog();
// Single instruction in regexp program.
class Inst {
public:
Inst() : out_opcode_(0), out1_(0) {}
// Copyable.
Inst(const Inst&) = default;
Inst& operator=(const Inst&) = default;
// Constructors per opcode
void InitAlt(uint32_t out, uint32_t out1);
void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
void InitCapture(int cap, uint32_t out);
void InitEmptyWidth(EmptyOp empty, uint32_t out);
void InitMatch(int id);
void InitNop(uint32_t out);
void InitFail();
// Getters
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
int last() { return (out_opcode_>>3)&1; }
int out() { return out_opcode_>>4; }
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
bool greedy(Prog* p) {
DCHECK_EQ(opcode(), kInstAltMatch);
return p->inst(out())->opcode() == kInstByteRange ||
(p->inst(out())->opcode() == kInstNop &&
p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
}
// Does this inst (an kInstByteRange) match c?
inline bool Matches(int c) {
DCHECK_EQ(opcode(), kInstByteRange);
if (foldcase_ && 'A' <= c && c <= 'Z')
c += 'a' - 'A';
return lo_ <= c && c <= hi_;
}
// Returns string representation for debugging.
string Dump();
// Maximum instruction id.
// (Must fit in out_opcode_. PatchList/last steal another bit.)
static const int kMaxInst = (1<<28) - 1;
private:
void set_opcode(InstOp opcode) {
out_opcode_ = (out()<<4) | (last()<<3) | opcode;
}
void set_last() {
out_opcode_ = (out()<<4) | (1<<3) | opcode();
}
void set_out(int out) {
out_opcode_ = (out<<4) | (last()<<3) | opcode();
}
void set_out_opcode(int out, InstOp opcode) {
out_opcode_ = (out<<4) | (last()<<3) | opcode;
}
uint32_t out_opcode_; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
union { // additional instruction arguments:
uint32_t out1_; // opcode == kInstAlt
// alternate next instruction
int32_t cap_; // opcode == kInstCapture
// Index of capture register (holds text
// position recorded by capturing parentheses).
// For \n (the submatch for the nth parentheses),
// the left parenthesis captures into register 2*n
// and the right one captures into register 2*n+1.
int32_t match_id_; // opcode == kInstMatch
// Match ID to identify this match (for re2::Set).
struct { // opcode == kInstByteRange
uint8_t lo_; // byte range is lo_-hi_ inclusive
uint8_t hi_; //
uint8_t foldcase_; // convert A-Z to a-z before checking range.
};
EmptyOp empty_; // opcode == kInstEmptyWidth
// empty_ is bitwise OR of kEmpty* flags above.
};
friend class Compiler;
friend struct PatchList;
friend class Prog;
};
// Whether to anchor the search.
enum Anchor {
kUnanchored, // match anywhere
kAnchored, // match only starting at beginning of text
};
// Kind of match to look for (for anchor != kFullMatch)
//
// kLongestMatch mode finds the overall longest
// match but still makes its submatch choices the way
// Perl would, not in the way prescribed by POSIX.
// The POSIX rules are much more expensive to implement,
// and no one has needed them.
//
// kFullMatch is not strictly necessary -- we could use
// kLongestMatch and then check the length of the match -- but
// the matching code can run faster if it knows to consider only
// full matches.
enum MatchKind {
kFirstMatch, // like Perl, PCRE
kLongestMatch, // like egrep or POSIX
kFullMatch, // match only entire text; implies anchor==kAnchored
kManyMatch // for SearchDFA, records set of matches
};
Inst *inst(int id) { return &inst_[id]; }
int start() { return start_; }
int start_unanchored() { return start_unanchored_; }
void set_start(int start) { start_ = start; }
void set_start_unanchored(int start) { start_unanchored_ = start; }
int size() { return size_; }
bool reversed() { return reversed_; }
void set_reversed(bool reversed) { reversed_ = reversed; }
int list_count() { return list_count_; }
int inst_count(InstOp op) { return inst_count_[op]; }
void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
int64_t dfa_mem() { return dfa_mem_; }
int flags() { return flags_; }
void set_flags(int flags) { flags_ = flags; }
bool anchor_start() { return anchor_start_; }
void set_anchor_start(bool b) { anchor_start_ = b; }
bool anchor_end() { return anchor_end_; }
void set_anchor_end(bool b) { anchor_end_ = b; }
int bytemap_range() { return bytemap_range_; }
const uint8_t* bytemap() { return bytemap_; }
// Lazily computed.
int first_byte();
// Returns string representation of program for debugging.
string Dump();
string DumpUnanchored();
string DumpByteMap();
// Returns the set of kEmpty flags that are in effect at
// position p within context.
static uint32_t EmptyFlags(const StringPiece& context, const char* p);
// Returns whether byte c is a word character: ASCII only.
// Used by the implementation of \b and \B.
// This is not right for Unicode, but:
// - it's hard to get right in a byte-at-a-time matching world
// (the DFA has only one-byte lookahead).
// - even if the lookahead were possible, the Progs would be huge.
// This crude approximation is the same one PCRE uses.
static bool IsWordChar(uint8_t c) {
return ('A' <= c && c <= 'Z') ||
('a' <= c && c <= 'z') ||
('0' <= c && c <= '9') ||
c == '_';
}
// Execution engines. They all search for the regexp (run the prog)
// in text, which is in the larger context (used for ^ $ \b etc).
// Anchor and kind control the kind of search.
// Returns true if match found, false if not.
// If match found, fills match[0..nmatch-1] with submatch info.
// match[0] is overall match, match[1] is first set of parens, etc.
// If a particular submatch is not matched during the regexp match,
// it is set to NULL.
//
// Matching text == StringPiece(NULL, 0) is treated as any other empty
// string, but note that on return, it will not be possible to distinguish
// submatches that matched that empty string from submatches that didn't
// match anything. Either way, match[i] == NULL.
// Search using NFA: can find submatches but kind of slow.
bool SearchNFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Search using DFA: much faster than NFA but only finds
// end of match and can use a lot more memory.
// Returns whether a match was found.
// If the DFA runs out of memory, sets *failed to true and returns false.
// If matches != NULL and kind == kManyMatch and there is a match,
// SearchDFA fills matches with the match IDs of the final matching state.
bool SearchDFA(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind, StringPiece* match0,
bool* failed, SparseSet* matches);
// The callback issued after building each DFA state with BuildEntireDFA().
// If next is null, then the memory budget has been exhausted and building
// will halt. Otherwise, the state has been built and next points to an array
// of bytemap_range()+1 slots holding the next states as per the bytemap and
// kByteEndText. The number of the state is implied by the callback sequence:
// the first callback is for state 0, the second callback is for state 1, ...
// match indicates whether the state is a matching state.
using DFAStateCallback = std::function<void(const int* next, bool match)>;
// Build the entire DFA for the given match kind.
// Usually the DFA is built out incrementally, as needed, which
// avoids lots of unnecessary work.
// If cb is not empty, it receives one callback per state built.
// Returns the number of states built.
// FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
// Controls whether the DFA should bail out early if the NFA would be faster.
// FOR TESTING ONLY.
static void TEST_dfa_should_bail_when_slow(bool b);
// Compute bytemap.
void ComputeByteMap();
// Computes whether all matches must begin with the same first
// byte, and if so, returns that byte. If not, returns -1.
int ComputeFirstByte();
// Run peep-hole optimizer on program.
void Optimize();
// One-pass NFA: only correct if IsOnePass() is true,
// but much faster than NFA (competitive with PCRE)
// for those expressions.
bool IsOnePass();
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Bit-state backtracking. Fast on small cases but uses memory
// proportional to the product of the program size and the text size.
bool SearchBitState(const StringPiece& text, const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
static const int kMaxOnePassCapture = 5; // $0 through $4
// Backtracking search: the gold standard against which the other
// implementations are checked. FOR TESTING ONLY.
// It allocates a ton of memory to avoid running forever.
// It is also recursive, so can't use in production (will overflow stacks).
// The name "Unsafe" here is supposed to be a flag that
// you should not be using this function.
bool UnsafeSearchBacktrack(const StringPiece& text,
const StringPiece& context,
Anchor anchor, MatchKind kind,
StringPiece* match, int nmatch);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen);
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout into the given sparse array.
void Fanout(SparseArray<int>* fanout);
// Compiles a collection of regexps to Prog. Each regexp will have
// its own Match instruction recording the index in the output vector.
static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
// Flattens the Prog from "tree" form to "list" form. This is an in-place
// operation in the sense that the old instructions are lost.
void Flatten();
// Walks the Prog; the "successor roots" or predecessors of the reachable
// instructions are marked in rootmap or predmap/predvec, respectively.
// reachable and stk are preallocated scratch structures.
void MarkSuccessors(SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the "dominator root"
// of the reachable instructions (if such exists) is marked in rootmap.
// reachable and stk are preallocated scratch structures.
void MarkDominator(int root, SparseArray<int>* rootmap,
SparseArray<int>* predmap,
std::vector<std::vector<int>>* predvec,
SparseSet* reachable, std::vector<int>* stk);
// Walks the Prog from the given "root" instruction; the reachable
// instructions are emitted in "list" form and appended to flat.
// reachable and stk are preallocated scratch structures.
void EmitList(int root, SparseArray<int>* rootmap,
std::vector<Inst>* flat,
SparseSet* reachable, std::vector<int>* stk);
private:
friend class Compiler;
DFA* GetDFA(MatchKind kind);
void DeleteDFA(DFA* dfa);
bool anchor_start_; // regexp has explicit start anchor
bool anchor_end_; // regexp has explicit end anchor
bool reversed_; // whether program runs backward over input
bool did_flatten_; // has Flatten been called?
bool did_onepass_; // has IsOnePass been called?
int start_; // entry point for program
int start_unanchored_; // unanchored entry point for program
int size_; // number of instructions
int bytemap_range_; // bytemap_[x] < bytemap_range_
int first_byte_; // required first byte for match, or -1 if none
int flags_; // regexp parse flags
int list_count_; // count of lists (see above)
int inst_count_[kNumInst]; // count of instructions by opcode
Inst* inst_; // pointer to instruction array
uint8_t* onepass_nodes_; // data for OnePass nodes
int64_t dfa_mem_; // Maximum memory for DFAs.
DFA* dfa_first_; // DFA cached for kFirstMatch/kManyMatch
DFA* dfa_longest_; // DFA cached for kLongestMatch/kFullMatch
uint8_t bytemap_[256]; // map from input bytes to byte classes
std::once_flag first_byte_once_;
std::once_flag dfa_first_once_;
std::once_flag dfa_longest_once_;
Prog(const Prog&) = delete;
Prog& operator=(const Prog&) = delete;
};
} // namespace re2
#endif // RE2_PROG_H_

File diff suppressed because it is too large Load Diff

View File

@ -1,941 +0,0 @@
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_RE2_H_
#define RE2_RE2_H_
// C++ interface to the re2 regular-expression library.
// RE2 supports Perl-style regular expressions (with extensions like
// \d, \w, \s, ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the re2 library and hence supports
// its syntax for regular expressions, which is similar to Perl's with
// some of the more complicated things thrown away. In particular,
// backreferences and generalized assertions are not available, nor is \Z.
//
// See https://github.com/google/re2/wiki/Syntax for the syntax
// supported by RE2, and a comparison with PCRE and PERL regexps.
//
// For those not familiar with Perl's regular expressions,
// here are some examples of the most commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(RE2::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!RE2::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, the pattern and input text are interpreted as UTF-8.
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
//
// Example:
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
// This may get a little faster in the future, but right now is slower
// than PCRE. On the other hand, failed matches run *very* fast (faster
// than PCRE), as do matches without substring extraction.
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(RE2::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PRE-COMPILED REGULAR EXPRESSIONS
//
// RE2 makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "RE2"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// RE2 pattern("h.*o");
// while (ReadLine(&str)) {
// if (RE2::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// RE2::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// USING VARIABLE NUMBER OF ARGUMENTS
//
// The above operations require you to know the number of arguments
// when you write the code. This is not always possible or easy (for
// example, the regular expression may be calculated at run time).
// You can use the "N" version of the operations when the number of
// match arguments are determined at run time.
//
// Example:
// const RE2::Arg* args[10];
// int n;
// // ... populate args with pointers to RE2::Arg values ...
// // ... set n to the number of RE2::Arg objects ...
// bool match = RE2::FullMatchN(input, pattern, args, n);
//
// The last statement is equivalent to
//
// bool match = RE2::FullMatch(input, pattern,
// *args[0], *args[1], ..., *args[n - 1]);
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
// will leave 64 in a, b, c, and d.
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <map>
#include <mutex>
#include <string>
#include "re2/stringpiece.h"
namespace re2 {
class Prog;
class Regexp;
} // namespace re2
namespace re2 {
// TODO(junyer): Get rid of this.
using std::string;
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "RE2" object is safe for
// concurrent use by multiple threads.
class RE2 {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
class Options;
// Defined in set.h.
class Set;
enum ErrorCode {
NoError = 0,
// Unexpected error
ErrorInternal,
// Parse errors
ErrorBadEscape, // bad escape sequence
ErrorBadCharClass, // bad character class
ErrorBadCharRange, // bad character class range
ErrorMissingBracket, // missing closing ]
ErrorMissingParen, // missing closing )
ErrorTrailingBackslash, // trailing \ at end of regexp
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
ErrorRepeatSize, // bad repetition argument
ErrorRepeatOp, // bad repetition operator
ErrorBadPerlOp, // bad perl operator
ErrorBadUTF8, // invalid UTF-8 in regexp
ErrorBadNamedCapture, // bad named capture group
ErrorPatternTooLarge // pattern too large (compile failed)
};
// Predefined common options.
// If you need more complicated things, instantiate
// an Option class, possibly passing one of these to
// the Option constructor, change the settings, and pass that
// Option class to the RE2 constructor.
enum CannedOptions {
DefaultOptions = 0,
Latin1, // treat input as Latin-1 (default UTF-8)
POSIX, // POSIX syntax, leftmost-longest match
Quiet // do not log about regexp parse errors
};
// Need to have the const char* and const string& forms for implicit
// conversions when passing string literals to FullMatch and PartialMatch.
// Otherwise the StringPiece form would be sufficient.
#ifndef SWIG
RE2(const char* pattern);
RE2(const string& pattern);
#endif
RE2(const StringPiece& pattern);
RE2(const StringPiece& pattern, const Options& options);
~RE2();
// Returns whether RE2 was created properly.
bool ok() const { return error_code() == NoError; }
// The string specification for this RE2. E.g.
// RE2 re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const { return pattern_; }
// If RE2 could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// If RE2 could not be created properly, returns an error code.
// Else returns RE2::NoError (== 0).
ErrorCode error_code() const { return error_code_; }
// If RE2 could not be created properly, returns the offending
// portion of the regexp.
const string& error_arg() const { return error_arg_; }
// Returns the program size, a very approximate measure of a regexp's "cost".
// Larger numbers are more expensive than smaller numbers.
int ProgramSize() const;
// EXPERIMENTAL! SUBJECT TO CHANGE!
// Outputs the program fanout as a histogram bucketed by powers of 2.
// Returns the number of the largest non-empty bucket.
int ProgramFanout(std::map<int, int>* histogram) const;
// Returns the underlying Regexp; not for general use.
// Returns entire_regexp_ so that callers don't need
// to know about prefix_ and prefix_foldcase_.
re2::Regexp* Regexp() const { return entire_regexp_; }
/***** The useful part: the matching interface *****/
// Matches "text" against "re". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "RE2" for "re".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "re" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
static bool FullMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
// Exactly like FullMatch(), except that "re" is allowed to match
// a substring of "text".
static bool PartialMatchN(const StringPiece& text, const RE2& re,
const Arg* const args[], int argc);
// Like FullMatch() and PartialMatch(), except that "re" has to match
// a prefix of the text, and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
static bool ConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int argc);
// Like Consume(), but does not anchor the match at the beginning of
// the text. That is, "re" need not start its match at the beginning
// of "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds
// the next word in "s" and stores it in "word".
static bool FindAndConsumeN(StringPiece* input, const RE2& re,
const Arg* const args[], int argc);
#ifndef SWIG
private:
template <typename F, typename SP>
static inline bool Apply(F f, SP sp, const RE2& re) {
return f(sp, re, NULL, 0);
}
template <typename F, typename SP, typename... A>
static inline bool Apply(F f, SP sp, const RE2& re, const A&... a) {
const Arg* const args[] = {&a...};
const int argc = sizeof...(a);
return f(sp, re, args, argc);
}
public:
// In order to allow FullMatch() et al. to be called with a varying number
// of arguments of varying types, we use two layers of variadic templates.
// The first layer constructs the temporary Arg objects. The second layer
// (above) constructs the array of pointers to the temporary Arg objects.
template <typename... A>
static bool FullMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(FullMatchN, text, re, Arg(std::forward<A>(a))...);
}
template <typename... A>
static bool PartialMatch(const StringPiece& text, const RE2& re, A&&... a) {
return Apply(PartialMatchN, text, re, Arg(std::forward<A>(a))...);
}
template <typename... A>
static bool Consume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(ConsumeN, input, re, Arg(std::forward<A>(a))...);
}
template <typename... A>
static bool FindAndConsume(StringPiece* input, const RE2& re, A&&... a) {
return Apply(FindAndConsumeN, input, re, Arg(std::forward<A>(a))...);
}
#endif
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(RE2::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces successive non-overlapping occurrences
// of the pattern in the string with the rewrite. E.g.
//
// string s = "yabba dabba doo";
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
// Replacements are not subject to re-matching.
//
// Because GlobalReplace only replaces non-overlapping matches,
// replacing "ana" within "banana" makes only one replacement, not two.
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const RE2& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
//
// REQUIRES: "text" must not alias any part of "*out".
static bool Extract(const StringPiece &text,
const RE2& pattern,
const StringPiece &rewrite,
string *out);
// Escapes all potentially meaningful regexp characters in
// 'unquoted'. The returned string, used as a regular expression,
// will exactly match the original string. For example,
// 1.5-2.0?
// may become:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
// Computes range for any strings matching regexp. The min and max can in
// some cases be arbitrarily precise, so the caller gets to specify the
// maximum desired length of string returned.
//
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
// string s that is an anchored match for this regexp satisfies
// min <= s && s <= max.
//
// Note that PossibleMatchRange() will only consider the first copy of an
// infinitely repeated element (i.e., any regexp element followed by a '*' or
// '+' operator). Regexps with "{N}" constructions are not affected, as those
// do not compile down to infinite repetitions.
//
// Returns true on success, false on error.
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
// Generic matching interface
// Type of match.
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH // Anchor at start and end
};
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction. The overall match ($0)
// does not count: if the regexp is "(a)(b)", returns 2.
int NumberOfCapturingGroups() const;
// Return a map from names to capturing indices.
// The map records the index of the leftmost group
// with the given name.
// Only valid until the re is deleted.
const std::map<string, int>& NamedCapturingGroups() const;
// Return a map from capturing indices to names.
// The map has no entries for unnamed groups.
// Only valid until the re is deleted.
const std::map<int, string>& CapturingGroupNames() const;
// General matching routine.
// Match against text starting at offset startpos
// and stopping the search at offset endpos.
// Returns true if match found, false if not.
// On a successful match, fills in match[] (up to nmatch entries)
// with information about submatches.
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
// setting match[0] = "barbaz", match[1].data() = NULL, match[2] = "bar",
// match[3].data() = NULL, ..., up to match[nmatch-1].data() = NULL.
//
// Don't ask for more match information than you will use:
// runs much faster with nmatch == 1 than nmatch > 1, and
// runs even faster if nmatch == 0.
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
// but will be handled correctly.
//
// Passing text == StringPiece(NULL, 0) will be handled like any other
// empty string, but note that on return, it will not be possible to tell
// whether submatch i matched the empty string or did not match:
// either way, match[i].data() == NULL.
bool Match(const StringPiece& text,
size_t startpos,
size_t endpos,
Anchor anchor,
StringPiece *match,
int nmatch) const;
// Check that the given rewrite string is suitable for use with this
// regular expression. It checks that:
// * The regular expression has enough parenthesized subexpressions
// to satisfy all of the \N tokens in rewrite
// * The rewrite string doesn't have any syntax errors. E.g.,
// '\' followed by anything other than a digit or '\'.
// A true return value guarantees that Replace() and Extract() won't
// fail because of a bad rewrite string.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Returns the maximum submatch needed for the rewrite to be done by
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
static int MaxSubmatch(const StringPiece& rewrite);
// Append the "rewrite" string, with backslash subsitutions from "vec",
// to string "out".
// Returns true on success. This method can fail because of a malformed
// rewrite string. CheckRewriteString guarantees that the rewrite will
// be sucessful.
bool Rewrite(string* out,
const StringPiece& rewrite,
const StringPiece* vec,
int veclen) const;
// Constructor options
class Options {
public:
// The options are (defaults in parentheses):
//
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
// posix_syntax (false) restrict regexps to POSIX egrep syntax
// longest_match (false) search for longest match, not first match
// log_errors (true) log syntax and execution errors to ERROR
// max_mem (see below) approx. max memory footprint of RE2
// literal (false) interpret string as literal, not regexp
// never_nl (false) never match \n, even if it is in regexp
// dot_nl (false) dot matches everything including new line
// never_capture (false) parse all parens as non-capturing
// case_sensitive (true) match is case-sensitive (regexp can override
// with (?i) unless in posix_syntax mode)
//
// The following options are only consulted when posix_syntax == true.
// (When posix_syntax == false these features are always enabled and
// cannot be turned off.)
// perl_classes (false) allow Perl's \d \s \w \D \S \W
// word_boundary (false) allow Perl's \b \B (word boundary and not)
// one_line (false) ^ and $ only match beginning and end of text
//
// The max_mem option controls how much memory can be used
// to hold the compiled form of the regexp (the Prog) and
// its cached DFA graphs. Code Search placed limits on the number
// of Prog instructions and DFA states: 10,000 for both.
// In RE2, those limits would translate to about 240 KB per Prog
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
// better job of keeping them small than Code Search did).
// Each RE2 has two Progs (one forward, one reverse), and each Prog
// can have two DFAs (one first match, one longest match).
// That makes 4 DFAs:
//
// forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
// if opt.longest_match() == false
// forward, longest-match - used for all ANCHOR_BOTH searches,
// and the other two kinds if
// opt.longest_match() == true
// reverse, first-match - never used
// reverse, longest-match - used as second phase for unanchored searches
//
// The RE2 memory budget is statically divided between the two
// Progs and then the DFAs: two thirds to the forward Prog
// and one third to the reverse Prog. The forward Prog gives half
// of what it has left over to each of its DFAs. The reverse Prog
// gives it all to its longest-match DFA.
//
// Once a DFA fills its budget, it flushes its cache and starts over.
// If this happens too often, RE2 falls back on the NFA implementation.
// For now, make the default budget something close to Code Search.
static const int kDefaultMaxMem = 8<<20;
enum Encoding {
EncodingUTF8 = 1,
EncodingLatin1
};
Options() :
encoding_(EncodingUTF8),
posix_syntax_(false),
longest_match_(false),
log_errors_(true),
max_mem_(kDefaultMaxMem),
literal_(false),
never_nl_(false),
dot_nl_(false),
never_capture_(false),
case_sensitive_(true),
perl_classes_(false),
word_boundary_(false),
one_line_(false) {
}
/*implicit*/ Options(CannedOptions);
Encoding encoding() const { return encoding_; }
void set_encoding(Encoding encoding) { encoding_ = encoding; }
// Legacy interface to encoding.
// TODO(rsc): Remove once clients have been converted.
bool utf8() const { return encoding_ == EncodingUTF8; }
void set_utf8(bool b) {
if (b) {
encoding_ = EncodingUTF8;
} else {
encoding_ = EncodingLatin1;
}
}
bool posix_syntax() const { return posix_syntax_; }
void set_posix_syntax(bool b) { posix_syntax_ = b; }
bool longest_match() const { return longest_match_; }
void set_longest_match(bool b) { longest_match_ = b; }
bool log_errors() const { return log_errors_; }
void set_log_errors(bool b) { log_errors_ = b; }
int64_t max_mem() const { return max_mem_; }
void set_max_mem(int64_t m) { max_mem_ = m; }
bool literal() const { return literal_; }
void set_literal(bool b) { literal_ = b; }
bool never_nl() const { return never_nl_; }
void set_never_nl(bool b) { never_nl_ = b; }
bool dot_nl() const { return dot_nl_; }
void set_dot_nl(bool b) { dot_nl_ = b; }
bool never_capture() const { return never_capture_; }
void set_never_capture(bool b) { never_capture_ = b; }
bool case_sensitive() const { return case_sensitive_; }
void set_case_sensitive(bool b) { case_sensitive_ = b; }
bool perl_classes() const { return perl_classes_; }
void set_perl_classes(bool b) { perl_classes_ = b; }
bool word_boundary() const { return word_boundary_; }
void set_word_boundary(bool b) { word_boundary_ = b; }
bool one_line() const { return one_line_; }
void set_one_line(bool b) { one_line_ = b; }
void Copy(const Options& src) {
*this = src;
}
int ParseFlags() const;
private:
Encoding encoding_;
bool posix_syntax_;
bool longest_match_;
bool log_errors_;
int64_t max_mem_;
bool literal_;
bool never_nl_;
bool dot_nl_;
bool never_capture_;
bool case_sensitive_;
bool perl_classes_;
bool word_boundary_;
bool one_line_;
};
// Returns the options set in the constructor.
const Options& options() const { return options_; };
// Argument converters; see below.
static inline Arg CRadix(short* x);
static inline Arg CRadix(unsigned short* x);
static inline Arg CRadix(int* x);
static inline Arg CRadix(unsigned int* x);
static inline Arg CRadix(long* x);
static inline Arg CRadix(unsigned long* x);
static inline Arg CRadix(long long* x);
static inline Arg CRadix(unsigned long long* x);
static inline Arg Hex(short* x);
static inline Arg Hex(unsigned short* x);
static inline Arg Hex(int* x);
static inline Arg Hex(unsigned int* x);
static inline Arg Hex(long* x);
static inline Arg Hex(unsigned long* x);
static inline Arg Hex(long long* x);
static inline Arg Hex(unsigned long long* x);
static inline Arg Octal(short* x);
static inline Arg Octal(unsigned short* x);
static inline Arg Octal(int* x);
static inline Arg Octal(unsigned int* x);
static inline Arg Octal(long* x);
static inline Arg Octal(unsigned long* x);
static inline Arg Octal(long long* x);
static inline Arg Octal(unsigned long long* x);
private:
void Init(const StringPiece& pattern, const Options& options);
bool DoMatch(const StringPiece& text,
Anchor anchor,
size_t* consumed,
const Arg* const args[],
int n) const;
re2::Prog* ReverseProg() const;
string pattern_; // string regular expression
Options options_; // option flags
string prefix_; // required prefix (before regexp_)
bool prefix_foldcase_; // prefix is ASCII case-insensitive
re2::Regexp* entire_regexp_; // parsed regular expression
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
re2::Prog* prog_; // compiled program for regexp
bool is_one_pass_; // can use prog_->SearchOnePass?
mutable re2::Prog* rprog_; // reverse program for regexp
mutable const string* error_; // Error indicator
// (or points to empty string)
mutable ErrorCode error_code_; // Error code
mutable string error_arg_; // Fragment of regexp showing error
mutable int num_captures_; // Number of capturing groups
// Map from capture names to indices
mutable const std::map<string, int>* named_groups_;
// Map from capture indices to names
mutable const std::map<int, string>* group_names_;
// Onces for lazy computations.
mutable std::once_flag rprog_once_;
mutable std::once_flag num_captures_once_;
mutable std::once_flag named_groups_once_;
mutable std::once_flag group_names_once_;
RE2(const RE2&) = delete;
RE2& operator=(const RE2&) = delete;
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
class _RE2_MatchObject {
public:
static inline bool Parse(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
}
};
class RE2::Arg {
public:
// Empty constructor so we can declare arrays of RE2::Arg
Arg();
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, size_t n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type, name) \
Arg(type* p) : arg_(p), parser_(name) {} \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char);
MAKE_PARSER(signed char, parse_schar);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
#undef MAKE_PARSER
// Generic constructor templates
template <class T> Arg(T* p)
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
template <class T> Arg(T* p, Parser parser)
: arg_(p), parser_(parser) { }
// Parse the data
bool Parse(const char* str, size_t n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, size_t n, void* dest);
static bool parse_char (const char* str, size_t n, void* dest);
static bool parse_schar (const char* str, size_t n, void* dest);
static bool parse_uchar (const char* str, size_t n, void* dest);
static bool parse_float (const char* str, size_t n, void* dest);
static bool parse_double (const char* str, size_t n, void* dest);
static bool parse_string (const char* str, size_t n, void* dest);
static bool parse_stringpiece (const char* str, size_t n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_##name(const char* str, size_t n, void* dest); \
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
int radix); \
\
public: \
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
DECLARE_INTEGER_PARSER(int);
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
#undef DECLARE_INTEGER_PARSER
};
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline bool RE2::Arg::Parse(const char* str, size_t n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline RE2::Arg RE2::Hex(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_hex); \
} \
inline RE2::Arg RE2::Octal(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_octal); \
} \
inline RE2::Arg RE2::CRadix(type* ptr) { \
return RE2::Arg(ptr, RE2::Arg::parse_##name##_cradix); \
}
MAKE_INTEGER_PARSER(short, short)
MAKE_INTEGER_PARSER(unsigned short, ushort)
MAKE_INTEGER_PARSER(int, int)
MAKE_INTEGER_PARSER(unsigned int, uint)
MAKE_INTEGER_PARSER(long, long)
MAKE_INTEGER_PARSER(unsigned long, ulong)
MAKE_INTEGER_PARSER(long long, longlong)
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
#undef MAKE_INTEGER_PARSER
#ifndef SWIG
// Silence warnings about missing initializers for members of LazyRE2.
// Note that we test for Clang first because it defines __GNUC__ as well.
#if defined(__clang__)
#elif defined(__GNUC__) && __GNUC__ >= 6
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif
// Helper for writing global or static RE2s safely.
// Write
// static LazyRE2 re = {".*"};
// and then use *re instead of writing
// static RE2 re(".*");
// The former is more careful about multithreaded
// situations than the latter.
//
// N.B. This class never deletes the RE2 object that
// it constructs: that's a feature, so that it can be used
// for global and function static variables.
class LazyRE2 {
private:
struct NoArg {};
public:
typedef RE2 element_type; // support std::pointer_traits
// Constructor omitted to preserve braced initialization in C++98.
// Pretend to be a pointer to Type (never NULL due to on-demand creation):
RE2& operator*() const { return *get(); }
RE2* operator->() const { return get(); }
// Named accessor/initializer:
RE2* get() const {
std::call_once(once_, &LazyRE2::Init, this);
return ptr_;
}
// All data fields must be public to support {"foo"} initialization.
const char* pattern_;
RE2::CannedOptions options_;
NoArg barrier_against_excess_initializers_;
mutable RE2* ptr_;
mutable std::once_flag once_;
private:
static void Init(const LazyRE2* lazy_re2) {
lazy_re2->ptr_ = new RE2(lazy_re2->pattern_, lazy_re2->options_);
}
void operator=(const LazyRE2&); // disallowed
};
#endif // SWIG
} // namespace re2
using re2::RE2;
using re2::LazyRE2;
#endif // RE2_RE2_H_

View File

@ -1,968 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Regular expression representation.
// Tested by parse_test.cc
#include "re2/regexp.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <mutex>
#include <string>
#include <vector>
#include "util/util.h"
#include "util/logging.h"
#include "util/mutex.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
#include "re2/walker-inl.h"
namespace re2 {
// Constructor. Allocates vectors as appropriate for operator.
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
: op_(static_cast<uint8_t>(op)),
simple_(false),
parse_flags_(static_cast<uint16_t>(parse_flags)),
ref_(1),
nsub_(0),
down_(NULL) {
subone_ = NULL;
memset(the_union_, 0, sizeof the_union_);
}
// Destructor. Assumes already cleaned up children.
// Private: use Decref() instead of delete to destroy Regexps.
// Can't call Decref on the sub-Regexps here because
// that could cause arbitrarily deep recursion, so
// required Decref() to have handled them for us.
Regexp::~Regexp() {
if (nsub_ > 0)
LOG(DFATAL) << "Regexp not destroyed.";
switch (op_) {
default:
break;
case kRegexpCapture:
delete name_;
break;
case kRegexpLiteralString:
delete[] runes_;
break;
case kRegexpCharClass:
if (cc_)
cc_->Delete();
delete ccb_;
break;
}
}
// If it's possible to destroy this regexp without recurring,
// do so and return true. Else return false.
bool Regexp::QuickDestroy() {
if (nsub_ == 0) {
delete this;
return true;
}
return false;
}
// Lazily allocated.
static Mutex* ref_mutex;
static std::map<Regexp*, int>* ref_map;
int Regexp::Ref() {
if (ref_ < kMaxRef)
return ref_;
MutexLock l(ref_mutex);
return (*ref_map)[this];
}
// Increments reference count, returns object as convenience.
Regexp* Regexp::Incref() {
if (ref_ >= kMaxRef-1) {
static std::once_flag ref_once;
std::call_once(ref_once, []() {
ref_mutex = new Mutex;
ref_map = new std::map<Regexp*, int>;
});
// Store ref count in overflow map.
MutexLock l(ref_mutex);
if (ref_ == kMaxRef) {
// already overflowed
(*ref_map)[this]++;
} else {
// overflowing now
(*ref_map)[this] = kMaxRef;
ref_ = kMaxRef;
}
return this;
}
ref_++;
return this;
}
// Decrements reference count and deletes this object if count reaches 0.
void Regexp::Decref() {
if (ref_ == kMaxRef) {
// Ref count is stored in overflow map.
MutexLock l(ref_mutex);
int r = (*ref_map)[this] - 1;
if (r < kMaxRef) {
ref_ = static_cast<uint16_t>(r);
ref_map->erase(this);
} else {
(*ref_map)[this] = r;
}
return;
}
ref_--;
if (ref_ == 0)
Destroy();
}
// Deletes this object; ref count has count reached 0.
void Regexp::Destroy() {
if (QuickDestroy())
return;
// Handle recursive Destroy with explicit stack
// to avoid arbitrarily deep recursion on process stack [sigh].
down_ = NULL;
Regexp* stack = this;
while (stack != NULL) {
Regexp* re = stack;
stack = re->down_;
if (re->ref_ != 0)
LOG(DFATAL) << "Bad reference count " << re->ref_;
if (re->nsub_ > 0) {
Regexp** subs = re->sub();
for (int i = 0; i < re->nsub_; i++) {
Regexp* sub = subs[i];
if (sub == NULL)
continue;
if (sub->ref_ == kMaxRef)
sub->Decref();
else
--sub->ref_;
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
sub->down_ = stack;
stack = sub;
}
}
if (re->nsub_ > 1)
delete[] subs;
re->nsub_ = 0;
}
delete re;
}
}
void Regexp::AddRuneToString(Rune r) {
DCHECK(op_ == kRegexpLiteralString);
if (nrunes_ == 0) {
// start with 8
runes_ = new Rune[8];
} else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
// double on powers of two
Rune *old = runes_;
runes_ = new Rune[nrunes_ * 2];
for (int i = 0; i < nrunes_; i++)
runes_[i] = old[i];
delete[] old;
}
runes_[nrunes_++] = r;
}
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpHaveMatch, flags);
re->match_id_ = match_id;
return re;
}
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
// Squash **, ++ and ??.
if (op == sub->op() && flags == sub->parse_flags())
return sub;
// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
// op is Star/Plus/Quest, we just have to check that sub->op() is too.
if ((sub->op() == kRegexpStar ||
sub->op() == kRegexpPlus ||
sub->op() == kRegexpQuest) &&
flags == sub->parse_flags()) {
// If sub is Star, no need to rewrite it.
if (sub->op() == kRegexpStar)
return sub;
// Rewrite sub to Star.
Regexp* re = new Regexp(kRegexpStar, flags);
re->AllocSub(1);
re->sub()[0] = sub->sub()[0]->Incref();
sub->Decref(); // We didn't consume the reference after all.
return re;
}
Regexp* re = new Regexp(op, flags);
re->AllocSub(1);
re->sub()[0] = sub;
return re;
}
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpPlus, sub, flags);
}
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpStar, sub, flags);
}
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
return StarPlusOrQuest(kRegexpQuest, sub, flags);
}
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
ParseFlags flags, bool can_factor) {
if (nsub == 1)
return sub[0];
if (nsub == 0) {
if (op == kRegexpAlternate)
return new Regexp(kRegexpNoMatch, flags);
else
return new Regexp(kRegexpEmptyMatch, flags);
}
Regexp** subcopy = NULL;
if (op == kRegexpAlternate && can_factor) {
// Going to edit sub; make a copy so we don't step on caller.
subcopy = new Regexp*[nsub];
memmove(subcopy, sub, nsub * sizeof sub[0]);
sub = subcopy;
nsub = FactorAlternation(sub, nsub, flags);
if (nsub == 1) {
Regexp* re = sub[0];
delete[] subcopy;
return re;
}
}
if (nsub > kMaxNsub) {
// Too many subexpressions to fit in a single Regexp.
// Make a two-level tree. Two levels gets us to 65535^2.
int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
Regexp* re = new Regexp(op, flags);
re->AllocSub(nbigsub);
Regexp** subs = re->sub();
for (int i = 0; i < nbigsub - 1; i++)
subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
nsub - (nbigsub-1)*kMaxNsub, flags,
false);
delete[] subcopy;
return re;
}
Regexp* re = new Regexp(op, flags);
re->AllocSub(nsub);
Regexp** subs = re->sub();
for (int i = 0; i < nsub; i++)
subs[i] = sub[i];
delete[] subcopy;
return re;
}
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
}
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
}
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
}
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
Regexp* re = new Regexp(kRegexpCapture, flags);
re->AllocSub(1);
re->sub()[0] = sub;
re->cap_ = cap;
return re;
}
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
Regexp* re = new Regexp(kRegexpRepeat, flags);
re->AllocSub(1);
re->sub()[0] = sub;
re->min_ = min;
re->max_ = max;
return re;
}
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpLiteral, flags);
re->rune_ = rune;
return re;
}
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
if (nrunes <= 0)
return new Regexp(kRegexpEmptyMatch, flags);
if (nrunes == 1)
return NewLiteral(runes[0], flags);
Regexp* re = new Regexp(kRegexpLiteralString, flags);
for (int i = 0; i < nrunes; i++)
re->AddRuneToString(runes[i]);
return re;
}
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
Regexp* re = new Regexp(kRegexpCharClass, flags);
re->cc_ = cc;
return re;
}
// Swaps this and that in place.
void Regexp::Swap(Regexp* that) {
// Can use memmove because Regexp is just a struct (no vtable).
char tmp[sizeof *this];
memmove(tmp, this, sizeof tmp);
memmove(this, that, sizeof tmp);
memmove(that, tmp, sizeof tmp);
}
// Tests equality of all top-level structure but not subregexps.
static bool TopEqual(Regexp* a, Regexp* b) {
if (a->op() != b->op())
return false;
switch (a->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpBeginText:
return true;
case kRegexpEndText:
// The parse flags remember whether it's \z or (?-m:$),
// which matters when testing against PCRE.
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
case kRegexpLiteral:
return a->rune() == b->rune() &&
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
case kRegexpLiteralString:
return a->nrunes() == b->nrunes() &&
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
memcmp(a->runes(), b->runes(),
a->nrunes() * sizeof a->runes()[0]) == 0;
case kRegexpAlternate:
case kRegexpConcat:
return a->nsub() == b->nsub();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
case kRegexpRepeat:
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
a->min() == b->min() &&
a->max() == b->max();
case kRegexpCapture:
return a->cap() == b->cap() && a->name() == b->name();
case kRegexpHaveMatch:
return a->match_id() == b->match_id();
case kRegexpCharClass: {
CharClass* acc = a->cc();
CharClass* bcc = b->cc();
return acc->size() == bcc->size() &&
acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
memcmp(acc->begin(), bcc->begin(),
(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
}
}
LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
return 0;
}
bool Regexp::Equal(Regexp* a, Regexp* b) {
if (a == NULL || b == NULL)
return a == b;
if (!TopEqual(a, b))
return false;
// Fast path:
// return without allocating vector if there are no subregexps.
switch (a->op()) {
case kRegexpAlternate:
case kRegexpConcat:
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
break;
default:
return true;
}
// Committed to doing real work.
// The stack (vector) has pairs of regexps waiting to
// be compared. The regexps are only equal if
// all the pairs end up being equal.
std::vector<Regexp*> stk;
for (;;) {
// Invariant: TopEqual(a, b) == true.
Regexp* a2;
Regexp* b2;
switch (a->op()) {
default:
break;
case kRegexpAlternate:
case kRegexpConcat:
for (int i = 0; i < a->nsub(); i++) {
a2 = a->sub()[i];
b2 = b->sub()[i];
if (!TopEqual(a2, b2))
return false;
stk.push_back(a2);
stk.push_back(b2);
}
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
case kRegexpCapture:
a2 = a->sub()[0];
b2 = b->sub()[0];
if (!TopEqual(a2, b2))
return false;
// Really:
// stk.push_back(a2);
// stk.push_back(b2);
// break;
// but faster to assign directly and loop.
a = a2;
b = b2;
continue;
}
size_t n = stk.size();
if (n == 0)
break;
DCHECK_GE(n, 2);
a = stk[n-2];
b = stk[n-1];
stk.resize(n-2);
}
return true;
}
// Keep in sync with enum RegexpStatusCode in regexp.h
static const char *kErrorStrings[] = {
"no error",
"unexpected error",
"invalid escape sequence",
"invalid character class",
"invalid character class range",
"missing ]",
"missing )",
"trailing \\",
"no argument for repetition operator",
"invalid repetition size",
"bad repetition operator",
"invalid perl operator",
"invalid UTF-8",
"invalid named capture group",
};
string RegexpStatus::CodeText(enum RegexpStatusCode code) {
if (code < 0 || code >= arraysize(kErrorStrings))
code = kRegexpInternalError;
return kErrorStrings[code];
}
string RegexpStatus::Text() const {
if (error_arg_.empty())
return CodeText(code_);
string s;
s.append(CodeText(code_));
s.append(": ");
s.append(error_arg_.data(), error_arg_.size());
return s;
}
void RegexpStatus::Copy(const RegexpStatus& status) {
code_ = status.code_;
error_arg_ = status.error_arg_;
}
typedef int Ignored; // Walker<void> doesn't exist
// Walker subclass to count capturing parens in regexp.
class NumCapturesWalker : public Regexp::Walker<Ignored> {
public:
NumCapturesWalker() : ncapture_(0) {}
int ncapture() { return ncapture_; }
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture)
ncapture_++;
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
return ignored;
}
private:
int ncapture_;
NumCapturesWalker(const NumCapturesWalker&) = delete;
NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
};
int Regexp::NumCaptures() {
NumCapturesWalker w;
w.Walk(this, 0);
return w.ncapture();
}
// Walker class to build map of named capture groups and their indices.
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
public:
NamedCapturesWalker() : map_(NULL) {}
~NamedCapturesWalker() { delete map_; }
std::map<string, int>* TakeMap() {
std::map<string, int>* m = map_;
map_ = NULL;
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new std::map<string, int>;
// Record first occurrence of each name.
// (The rule is that if you have the same name
// multiple times, only the leftmost one counts.)
if (map_->find(*re->name()) == map_->end())
(*map_)[*re->name()] = re->cap();
}
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
return ignored;
}
private:
std::map<string, int>* map_;
NamedCapturesWalker(const NamedCapturesWalker&) = delete;
NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
};
std::map<string, int>* Regexp::NamedCaptures() {
NamedCapturesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Walker class to build map from capture group indices to their names.
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
public:
CaptureNamesWalker() : map_(NULL) {}
~CaptureNamesWalker() { delete map_; }
std::map<int, string>* TakeMap() {
std::map<int, string>* m = map_;
map_ = NULL;
return m;
}
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
if (re->op() == kRegexpCapture && re->name() != NULL) {
// Allocate map once we find a name.
if (map_ == NULL)
map_ = new std::map<int, string>;
(*map_)[re->cap()] = *re->name();
}
return ignored;
}
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
// Should never be called: we use Walk not WalkExponential.
LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
return ignored;
}
private:
std::map<int, string>* map_;
CaptureNamesWalker(const CaptureNamesWalker&) = delete;
CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
};
std::map<int, string>* Regexp::CaptureNames() {
CaptureNamesWalker w;
w.Walk(this, 0);
return w.TakeMap();
}
// Determines whether regexp matches must be anchored
// with a fixed string prefix. If so, returns the prefix and
// the regexp that remains after the prefix. The prefix might
// be ASCII case-insensitive.
bool Regexp::RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix) {
// No need for a walker: the regexp must be of the form
// 1. some number of ^ anchors
// 2. a literal char or string
// 3. the rest
prefix->clear();
*foldcase = false;
*suffix = NULL;
if (op_ != kRegexpConcat)
return false;
// Some number of anchors, then a literal or concatenation.
int i = 0;
Regexp** sub = this->sub();
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
i++;
if (i == 0 || i >= nsub_)
return false;
Regexp* re = sub[i];
switch (re->op_) {
default:
return false;
case kRegexpLiteralString:
// Convert to string in proper encoding.
if (re->parse_flags() & Latin1) {
prefix->resize(re->nrunes_);
for (int j = 0; j < re->nrunes_; j++)
(*prefix)[j] = static_cast<char>(re->runes_[j]);
} else {
// Convert to UTF-8 in place.
// Assume worst-case space and then trim.
prefix->resize(re->nrunes_ * UTFmax);
char *p = &(*prefix)[0];
for (int j = 0; j < re->nrunes_; j++) {
Rune r = re->runes_[j];
if (r < Runeself)
*p++ = static_cast<char>(r);
else
p += runetochar(p, &r);
}
prefix->resize(p - &(*prefix)[0]);
}
break;
case kRegexpLiteral:
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
prefix->append(1, static_cast<char>(re->rune_));
} else {
char buf[UTFmax];
prefix->append(buf, runetochar(buf, &re->rune_));
}
break;
}
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
i++;
// The rest.
if (i < nsub_) {
for (int j = i; j < nsub_; j++)
sub[j]->Incref();
re = Concat(sub + i, nsub_ - i, parse_flags());
} else {
re = new Regexp(kRegexpEmptyMatch, parse_flags());
}
*suffix = re;
return true;
}
// Character class builder is a balanced binary tree (STL set)
// containing non-overlapping, non-abutting RuneRanges.
// The less-than operator used in the tree treats two
// ranges as equal if they overlap at all, so that
// lookups for a particular Rune are possible.
CharClassBuilder::CharClassBuilder() {
nrunes_ = 0;
upper_ = 0;
lower_ = 0;
}
// Add lo-hi to the class; return whether class got bigger.
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
if (hi < lo)
return false;
if (lo <= 'z' && hi >= 'A') {
// Overlaps some alpha, maybe not all.
// Update bitmaps telling which ASCII letters are in the set.
Rune lo1 = std::max<Rune>(lo, 'A');
Rune hi1 = std::min<Rune>(hi, 'Z');
if (lo1 <= hi1)
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
lo1 = std::max<Rune>(lo, 'a');
hi1 = std::min<Rune>(hi, 'z');
if (lo1 <= hi1)
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
}
{ // Check whether lo, hi is already in the class.
iterator it = ranges_.find(RuneRange(lo, lo));
if (it != end() && it->lo <= lo && hi <= it->hi)
return false;
}
// Look for a range abutting lo on the left.
// If it exists, take it out and increase our range.
if (lo > 0) {
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
if (it != end()) {
lo = it->lo;
if (it->hi > hi)
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for a range abutting hi on the right.
// If it exists, take it out and increase our range.
if (hi < Runemax) {
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
if (it != end()) {
hi = it->hi;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
}
// Look for ranges between lo and hi. Take them out.
// This is only safe because the set has no overlapping ranges.
// We've already removed any ranges abutting lo and hi, so
// any that overlap [lo, hi] must be contained within it.
for (;;) {
iterator it = ranges_.find(RuneRange(lo, hi));
if (it == end())
break;
nrunes_ -= it->hi - it->lo + 1;
ranges_.erase(it);
}
// Finally, add [lo, hi].
nrunes_ += hi - lo + 1;
ranges_.insert(RuneRange(lo, hi));
return true;
}
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
for (iterator it = cc->begin(); it != cc->end(); ++it)
AddRange(it->lo, it->hi);
}
bool CharClassBuilder::Contains(Rune r) {
return ranges_.find(RuneRange(r, r)) != end();
}
// Does the character class behave the same on A-Z as on a-z?
bool CharClassBuilder::FoldsASCII() {
return ((upper_ ^ lower_) & AlphaMask) == 0;
}
CharClassBuilder* CharClassBuilder::Copy() {
CharClassBuilder* cc = new CharClassBuilder;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_.insert(RuneRange(it->lo, it->hi));
cc->upper_ = upper_;
cc->lower_ = lower_;
cc->nrunes_ = nrunes_;
return cc;
}
void CharClassBuilder::RemoveAbove(Rune r) {
if (r >= Runemax)
return;
if (r < 'z') {
if (r < 'a')
lower_ = 0;
else
lower_ &= AlphaMask >> ('z' - r);
}
if (r < 'Z') {
if (r < 'A')
upper_ = 0;
else
upper_ &= AlphaMask >> ('Z' - r);
}
for (;;) {
iterator it = ranges_.find(RuneRange(r + 1, Runemax));
if (it == end())
break;
RuneRange rr = *it;
ranges_.erase(it);
nrunes_ -= rr.hi - rr.lo + 1;
if (rr.lo <= r) {
rr.hi = r;
ranges_.insert(rr);
nrunes_ += rr.hi - rr.lo + 1;
}
}
}
void CharClassBuilder::Negate() {
// Build up negation and then copy in.
// Could edit ranges in place, but C++ won't let me.
std::vector<RuneRange> v;
v.reserve(ranges_.size() + 1);
// In negation, first range begins at 0, unless
// the current class begins at 0.
iterator it = begin();
if (it == end()) {
v.push_back(RuneRange(0, Runemax));
} else {
int nextlo = 0;
if (it->lo == 0) {
nextlo = it->hi + 1;
++it;
}
for (; it != end(); ++it) {
v.push_back(RuneRange(nextlo, it->lo - 1));
nextlo = it->hi + 1;
}
if (nextlo <= Runemax)
v.push_back(RuneRange(nextlo, Runemax));
}
ranges_.clear();
for (size_t i = 0; i < v.size(); i++)
ranges_.insert(v[i]);
upper_ = AlphaMask & ~upper_;
lower_ = AlphaMask & ~lower_;
nrunes_ = Runemax+1 - nrunes_;
}
// Character class is a sorted list of ranges.
// The ranges are allocated in the same block as the header,
// necessitating a special allocator and Delete method.
CharClass* CharClass::New(int maxranges) {
CharClass* cc;
uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
cc = reinterpret_cast<CharClass*>(data);
cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
cc->nranges_ = 0;
cc->folds_ascii_ = false;
cc->nrunes_ = 0;
return cc;
}
void CharClass::Delete() {
uint8_t* data = reinterpret_cast<uint8_t*>(this);
delete[] data;
}
CharClass* CharClass::Negate() {
CharClass* cc = CharClass::New(nranges_+1);
cc->folds_ascii_ = folds_ascii_;
cc->nrunes_ = Runemax + 1 - nrunes_;
int n = 0;
int nextlo = 0;
for (CharClass::iterator it = begin(); it != end(); ++it) {
if (it->lo == nextlo) {
nextlo = it->hi + 1;
} else {
cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
nextlo = it->hi + 1;
}
}
if (nextlo <= Runemax)
cc->ranges_[n++] = RuneRange(nextlo, Runemax);
cc->nranges_ = n;
return cc;
}
bool CharClass::Contains(Rune r) {
RuneRange* rr = ranges_;
int n = nranges_;
while (n > 0) {
int m = n/2;
if (rr[m].hi < r) {
rr += m+1;
n -= m+1;
} else if (r < rr[m].lo) {
n = m;
} else { // rr[m].lo <= r && r <= rr[m].hi
return true;
}
}
return false;
}
CharClass* CharClassBuilder::GetCharClass() {
CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
int n = 0;
for (iterator it = begin(); it != end(); ++it)
cc->ranges_[n++] = *it;
cc->nranges_ = n;
DCHECK_LE(n, static_cast<int>(ranges_.size()));
cc->nrunes_ = nrunes_;
cc->folds_ascii_ = FoldsASCII();
return cc;
}
} // namespace re2

View File

@ -1,652 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_REGEXP_H_
#define RE2_REGEXP_H_
// --- SPONSORED LINK --------------------------------------------------
// If you want to use this library for regular expression matching,
// you should use re2/re2.h, which provides a class RE2 that
// mimics the PCRE interface provided by PCRE's C++ wrappers.
// This header describes the low-level interface used to implement RE2
// and may change in backwards-incompatible ways from time to time.
// In contrast, RE2's interface will not.
// ---------------------------------------------------------------------
// Regular expression library: parsing, execution, and manipulation
// of regular expressions.
//
// Any operation that traverses the Regexp structures should be written
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
// regular expressions such as x++++++++++++++++++++... might cause recursive
// traversals to overflow the stack.
//
// It is the caller's responsibility to provide appropriate mutual exclusion
// around manipulation of the regexps. RE2 does this.
//
// PARSING
//
// Regexp::Parse parses regular expressions encoded in UTF-8.
// The default syntax is POSIX extended regular expressions,
// with the following changes:
//
// 1. Backreferences (optional in POSIX EREs) are not supported.
// (Supporting them precludes the use of DFA-based
// matching engines.)
//
// 2. Collating elements and collation classes are not supported.
// (No one has needed or wanted them.)
//
// The exact syntax accepted can be modified by passing flags to
// Regexp::Parse. In particular, many of the basic Perl additions
// are available. The flags are documented below (search for LikePerl).
//
// If parsed with the flag Regexp::Latin1, both the regular expression
// and the input to the matching routines are assumed to be encoded in
// Latin-1, not UTF-8.
//
// EXECUTION
//
// Once Regexp has parsed a regular expression, it provides methods
// to search text using that regular expression. These methods are
// implemented via calling out to other regular expression libraries.
// (Let's call them the sublibraries.)
//
// To call a sublibrary, Regexp does not simply prepare a
// string version of the regular expression and hand it to the
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
// corresponding internal representation used by the sublibrary.
// This has the drawback of needing to know the internal representation
// used by the sublibrary, but it has two important benefits:
//
// 1. The syntax and meaning of regular expressions is guaranteed
// to be that used by Regexp's parser, not the syntax expected
// by the sublibrary. Regexp might accept a restricted or
// expanded syntax for regular expressions as compared with
// the sublibrary. As long as Regexp can translate from its
// internal form into the sublibrary's, clients need not know
// exactly which sublibrary they are using.
//
// 2. The sublibrary parsers are bypassed. For whatever reason,
// sublibrary regular expression parsers often have security
// problems. For example, plan9grep's regular expression parser
// has a buffer overflow in its handling of large character
// classes, and PCRE's parser has had buffer overflow problems
// in the past. Security-team requires sandboxing of sublibrary
// regular expression parsers. Avoiding the sublibrary parsers
// avoids the sandbox.
//
// The execution methods we use now are provided by the compiled form,
// Prog, described in prog.h
//
// MANIPULATION
//
// Unlike other regular expression libraries, Regexp makes its parsed
// form accessible to clients, so that client code can analyze the
// parsed regular expressions.
#include <stdint.h>
#include <map>
#include <set>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/stringpiece.h"
namespace re2 {
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
// Matches no strings.
kRegexpNoMatch = 1,
// Matches empty string.
kRegexpEmptyMatch,
// Matches rune_.
kRegexpLiteral,
// Matches runes_.
kRegexpLiteralString,
// Matches concatenation of sub_[0..nsub-1].
kRegexpConcat,
// Matches union of sub_[0..nsub-1].
kRegexpAlternate,
// Matches sub_[0] zero or more times.
kRegexpStar,
// Matches sub_[0] one or more times.
kRegexpPlus,
// Matches sub_[0] zero or one times.
kRegexpQuest,
// Matches sub_[0] at least min_ times, at most max_ times.
// max_ == -1 means no upper limit.
kRegexpRepeat,
// Parenthesized (capturing) subexpression. Index is cap_.
// Optionally, capturing name is name_.
kRegexpCapture,
// Matches any character.
kRegexpAnyChar,
// Matches any byte [sic].
kRegexpAnyByte,
// Matches empty string at beginning of line.
kRegexpBeginLine,
// Matches empty string at end of line.
kRegexpEndLine,
// Matches word boundary "\b".
kRegexpWordBoundary,
// Matches not-a-word boundary "\B".
kRegexpNoWordBoundary,
// Matches empty string at beginning of text.
kRegexpBeginText,
// Matches empty string at end of text.
kRegexpEndText,
// Matches character class given by cc_.
kRegexpCharClass,
// Forces match of entire expression right now,
// with match ID match_id_ (used by RE2::Set).
kRegexpHaveMatch,
kMaxRegexpOp = kRegexpHaveMatch,
};
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
// No error
kRegexpSuccess = 0,
// Unexpected error
kRegexpInternalError,
// Parse errors
kRegexpBadEscape, // bad escape sequence
kRegexpBadCharClass, // bad character class
kRegexpBadCharRange, // bad character class range
kRegexpMissingBracket, // missing closing ]
kRegexpMissingParen, // missing closing )
kRegexpTrailingBackslash, // at end of regexp
kRegexpRepeatArgument, // repeat argument missing, e.g. "*"
kRegexpRepeatSize, // bad repetition argument
kRegexpRepeatOp, // bad repetition operator
kRegexpBadPerlOp, // bad perl operator
kRegexpBadUTF8, // invalid UTF-8 in regexp
kRegexpBadNamedCapture, // bad named capture
};
// Error status for certain operations.
class RegexpStatus {
public:
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
~RegexpStatus() { delete tmp_; }
void set_code(RegexpStatusCode code) { code_ = code; }
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
RegexpStatusCode code() const { return code_; }
const StringPiece& error_arg() const { return error_arg_; }
bool ok() const { return code() == kRegexpSuccess; }
// Copies state from status.
void Copy(const RegexpStatus& status);
// Returns text equivalent of code, e.g.:
// "Bad character class"
static string CodeText(RegexpStatusCode code);
// Returns text describing error, e.g.:
// "Bad character class: [z-a]"
string Text() const;
private:
RegexpStatusCode code_; // Kind of error
StringPiece error_arg_; // Piece of regexp containing syntax error.
string* tmp_; // Temporary storage, possibly where error_arg_ is.
RegexpStatus(const RegexpStatus&) = delete;
RegexpStatus& operator=(const RegexpStatus&) = delete;
};
// Compiled form; see prog.h
class Prog;
struct RuneRange {
RuneRange() : lo(0), hi(0) { }
RuneRange(int l, int h) : lo(l), hi(h) { }
Rune lo;
Rune hi;
};
// Less-than on RuneRanges treats a == b if they overlap at all.
// This lets us look in a set to find the range covering a particular Rune.
struct RuneRangeLess {
bool operator()(const RuneRange& a, const RuneRange& b) const {
return a.hi < b.lo;
}
};
class CharClassBuilder;
class CharClass {
public:
void Delete();
typedef RuneRange* iterator;
iterator begin() { return ranges_; }
iterator end() { return ranges_ + nranges_; }
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
bool FoldsASCII() { return folds_ascii_; }
bool Contains(Rune r);
CharClass* Negate();
private:
CharClass(); // not implemented
~CharClass(); // not implemented
static CharClass* New(int maxranges);
friend class CharClassBuilder;
bool folds_ascii_;
int nrunes_;
RuneRange *ranges_;
int nranges_;
CharClass(const CharClass&) = delete;
CharClass& operator=(const CharClass&) = delete;
};
class Regexp {
public:
// Flags for parsing. Can be ORed together.
enum ParseFlags {
NoParseFlags = 0,
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
Literal = 1<<1, // Treat s as literal string instead of a regexp.
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
// and [[:space:]] to match newline.
DotNL = 1<<3, // Allow . to match newline.
MatchNL = ClassNL | DotNL,
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
// end of text, not around embedded newlines.
// (Perl's default)
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
PerlClasses = 1<<7, // Allow Perl character classes like \d.
PerlB = 1<<8, // Allow Perl's \b and \B.
PerlX = 1<<9, // Perl extensions:
// non-capturing parens - (?: )
// non-greedy operators - *? +? ?? {}?
// flag edits - (?i) (?-i) (?i: )
// i - FoldCase
// m - !OneLine
// s - DotNL
// U - NonGreedy
// line ends: \A \z
// \Q and \E to disable/enable metacharacters
// (?P<name>expr) for named captures
// \C to match any single byte
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
// and \P{Han} for its negation.
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
// it explicitly.
NeverCapture = 1<<12, // Parse all parens as non-capturing.
// As close to Perl as we can get.
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
UnicodeGroups,
// Internal use only.
WasDollar = 1<<13, // on kRegexpEndText: was $ in regexp text
AllParseFlags = (1<<14)-1,
};
// Get. No set, Regexps are logically immutable once created.
RegexpOp op() { return static_cast<RegexpOp>(op_); }
int nsub() { return nsub_; }
bool simple() { return simple_ != 0; }
ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
int Ref(); // For testing.
Regexp** sub() {
if(nsub_ <= 1)
return &subone_;
else
return submany_;
}
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
// Increments reference count, returns object as convenience.
Regexp* Incref();
// Decrements reference count and deletes this object if count reaches 0.
void Decref();
// Parses string s to produce regular expression, returned.
// Caller must release return value with re->Decref().
// On failure, sets *status (if status != NULL) and returns NULL.
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
RegexpStatus* status);
// Returns a _new_ simplified version of the current regexp.
// Does not edit the current regexp.
// Caller must release return value with re->Decref().
// Simplified means that counted repetition has been rewritten
// into simpler terms and all Perl/POSIX features have been
// removed. The result will capture exactly the same
// subexpressions the original did, unless formatted with ToString.
Regexp* Simplify();
friend class CoalesceWalker;
friend class SimplifyWalker;
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *status (if status != NULL) on parse error.
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status);
// Returns the number of capturing groups in the regexp.
int NumCaptures();
friend class NumCapturesWalker;
// Returns a map from names to capturing group indices,
// or NULL if the regexp contains no named capture groups.
// The caller is responsible for deleting the map.
std::map<string, int>* NamedCaptures();
// Returns a map from capturing group indices to capturing group
// names or NULL if the regexp contains no named capture groups. The
// caller is responsible for deleting the map.
std::map<int, string>* CaptureNames();
// Returns a string representation of the current regexp,
// using as few parentheses as possible.
string ToString();
// Convenience functions. They consume the passed reference,
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
// They do not consume allocated arrays like subs or runes.
static Regexp* Plus(Regexp* sub, ParseFlags flags);
static Regexp* Star(Regexp* sub, ParseFlags flags);
static Regexp* Quest(Regexp* sub, ParseFlags flags);
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
static Regexp* HaveMatch(int match_id, ParseFlags flags);
// Like Alternate but does not factor out common prefixes.
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
// Debugging function. Returns string format for regexp
// that makes structure clear. Does NOT use regexp syntax.
string Dump();
// Helper traversal class, defined fully in walker-inl.h.
template<typename T> class Walker;
// Compile to Prog. See prog.h
// Reverse prog expects to be run over text backward.
// Construction and execution of prog will
// stay within approximately max_mem bytes of memory.
// If max_mem <= 0, a reasonable default is used.
Prog* CompileToProg(int64_t max_mem);
Prog* CompileToReverseProg(int64_t max_mem);
// Whether to expect this library to find exactly the same answer as PCRE
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
// obscure cases behave differently. Technically this is more a property
// of the Prog than the Regexp, but the computation is much easier to do
// on the Regexp. See mimics_pcre.cc for the exact conditions.
bool MimicsPCRE();
// Benchmarking function.
void NullWalk();
// Whether every match of this regexp must be anchored and
// begin with a non-empty fixed string (perhaps after ASCII
// case-folding). If so, returns the prefix and the sub-regexp that
// follows it.
// Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
// regardless of the return value.
bool RequiredPrefix(string* prefix, bool* foldcase, Regexp** suffix);
private:
// Constructor allocates vectors as appropriate for operator.
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
// Use Decref() instead of delete to release Regexps.
// This is private to catch deletes at compile time.
~Regexp();
void Destroy();
bool QuickDestroy();
// Helpers for Parse. Listed here so they can edit Regexps.
class ParseState;
friend class ParseState;
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
RegexpStatus* status);
// Helper for testing [sic].
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
// Computes whether Regexp is already simple.
bool ComputeSimple();
// Constructor that generates a Star, Plus or Quest,
// squashing the pair if sub is also a Star, Plus or Quest.
static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
// Constructor that generates a concatenation or alternation,
// enforcing the limit on the number of subexpressions for
// a particular Regexp.
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
ParseFlags flags, bool can_factor);
// Returns the leading string that re starts with.
// The returned Rune* points into a piece of re,
// so it must not be used after the caller calls re->Decref().
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
// Removes the first n leading runes from the beginning of re.
// Edits re in place.
static void RemoveLeadingString(Regexp* re, int n);
// Returns the leading regexp in re's top-level concatenation.
// The returned Regexp* points at re or a sub-expression of re,
// so it must not be used after the caller calls re->Decref().
static Regexp* LeadingRegexp(Regexp* re);
// Removes LeadingRegexp(re) from re and returns the remainder.
// Might edit re in place.
static Regexp* RemoveLeadingRegexp(Regexp* re);
// Simplifies an alternation of literal strings by factoring out
// common prefixes.
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
friend class FactorAlternationImpl;
// Is a == b? Only efficient on regexps that have not been through
// Simplify yet - the expansion of a kRegexpRepeat will make this
// take a long time. Do not call on such regexps, hence private.
static bool Equal(Regexp* a, Regexp* b);
// Allocate space for n sub-regexps.
void AllocSub(int n) {
DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
if (n > 1)
submany_ = new Regexp*[n];
nsub_ = static_cast<uint16_t>(n);
}
// Add Rune to LiteralString
void AddRuneToString(Rune r);
// Swaps this with that, in place.
void Swap(Regexp *that);
// Operator. See description of operators above.
// uint8_t instead of RegexpOp to control space usage.
uint8_t op_;
// Is this regexp structure already simple
// (has it been returned by Simplify)?
// uint8_t instead of bool to control space usage.
uint8_t simple_;
// Flags saved from parsing and used during execution.
// (Only FoldCase is used.)
// uint16_t instead of ParseFlags to control space usage.
uint16_t parse_flags_;
// Reference count. Exists so that SimplifyRegexp can build
// regexp structures that are dags rather than trees to avoid
// exponential blowup in space requirements.
// uint16_t to control space usage.
// The standard regexp routines will never generate a
// ref greater than the maximum repeat count (kMaxRepeat),
// but even so, Incref and Decref consult an overflow map
// when ref_ reaches kMaxRef.
uint16_t ref_;
static const uint16_t kMaxRef = 0xffff;
// Subexpressions.
// uint16_t to control space usage.
// Concat and Alternate handle larger numbers of subexpressions
// by building concatenation or alternation trees.
// Other routines should call Concat or Alternate instead of
// filling in sub() by hand.
uint16_t nsub_;
static const uint16_t kMaxNsub = 0xffff;
union {
Regexp** submany_; // if nsub_ > 1
Regexp* subone_; // if nsub_ == 1
};
// Extra space for parse and teardown stacks.
Regexp* down_;
// Arguments to operator. See description of operators above.
union {
struct { // Repeat
int max_;
int min_;
};
struct { // Capture
int cap_;
string* name_;
};
struct { // LiteralString
int nrunes_;
Rune* runes_;
};
struct { // CharClass
// These two could be in separate union members,
// but it wouldn't save any space (there are other two-word structs)
// and keeping them separate avoids confusion during parsing.
CharClass* cc_;
CharClassBuilder* ccb_;
};
Rune rune_; // Literal
int match_id_; // HaveMatch
void *the_union_[2]; // as big as any other element, for memset
};
Regexp(const Regexp&) = delete;
Regexp& operator=(const Regexp&) = delete;
};
// Character class set: contains non-overlapping, non-abutting RuneRanges.
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
class CharClassBuilder {
public:
CharClassBuilder();
typedef RuneRangeSet::iterator iterator;
iterator begin() { return ranges_.begin(); }
iterator end() { return ranges_.end(); }
int size() { return nrunes_; }
bool empty() { return nrunes_ == 0; }
bool full() { return nrunes_ == Runemax+1; }
bool Contains(Rune r);
bool FoldsASCII();
bool AddRange(Rune lo, Rune hi); // returns whether class changed
CharClassBuilder* Copy();
void AddCharClass(CharClassBuilder* cc);
void Negate();
void RemoveAbove(Rune r);
CharClass* GetCharClass();
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
private:
static const uint32_t AlphaMask = (1<<26) - 1;
uint32_t upper_; // bitmap of A-Z
uint32_t lower_; // bitmap of a-z
int nrunes_;
RuneRangeSet ranges_;
CharClassBuilder(const CharClassBuilder&) = delete;
CharClassBuilder& operator=(const CharClassBuilder&) = delete;
};
// Bitwise ops on ParseFlags produce ParseFlags.
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
Regexp::ParseFlags b) {
return static_cast<Regexp::ParseFlags>(
static_cast<int>(a) | static_cast<int>(b));
}
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
Regexp::ParseFlags b) {
return static_cast<Regexp::ParseFlags>(
static_cast<int>(a) ^ static_cast<int>(b));
}
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
Regexp::ParseFlags b) {
return static_cast<Regexp::ParseFlags>(
static_cast<int>(a) & static_cast<int>(b));
}
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
// Attempting to produce a value out of enum's range has undefined behaviour.
return static_cast<Regexp::ParseFlags>(
~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
}
} // namespace re2
#endif // RE2_REGEXP_H_

View File

@ -1,154 +0,0 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/set.h"
#include <stddef.h>
#include <algorithm>
#include <memory>
#include "util/util.h"
#include "util/logging.h"
#include "re2/stringpiece.h"
#include "re2/prog.h"
#include "re2/re2.h"
#include "re2/regexp.h"
namespace re2 {
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor) {
options_.Copy(options);
options_.set_never_capture(true); // might unblock some optimisations
anchor_ = anchor;
prog_ = NULL;
compiled_ = false;
size_ = 0;
}
RE2::Set::~Set() {
for (size_t i = 0; i < elem_.size(); i++)
elem_[i].second->Decref();
delete prog_;
}
int RE2::Set::Add(const StringPiece& pattern, string* error) {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Add() called after compiling";
return -1;
}
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
RegexpStatus status;
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
if (re == NULL) {
if (error != NULL)
*error = status.Text();
if (options_.log_errors())
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
return -1;
}
// Concatenate with match index and push on vector.
int n = static_cast<int>(elem_.size());
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
if (re->op() == kRegexpConcat) {
int nsub = re->nsub();
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
for (int i = 0; i < nsub; i++)
sub[i] = re->sub()[i]->Incref();
sub[nsub] = m;
re->Decref();
re = re2::Regexp::Concat(sub, nsub + 1, pf);
delete[] sub;
} else {
re2::Regexp* sub[2];
sub[0] = re;
sub[1] = m;
re = re2::Regexp::Concat(sub, 2, pf);
}
elem_.emplace_back(pattern.ToString(), re);
return n;
}
bool RE2::Set::Compile() {
if (compiled_) {
LOG(DFATAL) << "RE2::Set::Compile() called more than once";
return false;
}
compiled_ = true;
size_ = static_cast<int>(elem_.size());
// Sort the elements by their patterns. This is good enough for now
// until we have a Regexp comparison function. (Maybe someday...)
std::sort(elem_.begin(), elem_.end(),
[](const Elem& a, const Elem& b) -> bool {
return a.first < b.first;
});
re2::Regexp** sub = new re2::Regexp*[size_];
for (size_t i = 0; i < elem_.size(); i++)
sub[i] = elem_[i].second;
elem_.clear();
elem_.shrink_to_fit();
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
options_.ParseFlags());
re2::Regexp* re = re2::Regexp::Alternate(sub, size_, pf);
delete[] sub;
prog_ = Prog::CompileSet(re, anchor_, options_.max_mem());
re->Decref();
return prog_ != NULL;
}
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v) const {
return Match(text, v, NULL);
}
bool RE2::Set::Match(const StringPiece& text, std::vector<int>* v,
ErrorInfo* error_info) const {
if (!compiled_) {
LOG(DFATAL) << "RE2::Set::Match() called before compiling";
if (error_info != NULL)
error_info->kind = kNotCompiled;
return false;
}
bool dfa_failed = false;
std::unique_ptr<SparseSet> matches;
if (v != NULL) {
matches.reset(new SparseSet(size_));
v->clear();
}
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored, Prog::kManyMatch,
NULL, &dfa_failed, matches.get());
if (dfa_failed) {
if (options_.log_errors())
LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
<< "bytemap range " << prog_->bytemap_range() << ", "
<< "list count " << prog_->list_count();
if (error_info != NULL)
error_info->kind = kOutOfMemory;
return false;
}
if (ret == false) {
if (error_info != NULL)
error_info->kind = kNoError;
return false;
}
if (v != NULL) {
if (matches->empty()) {
LOG(DFATAL) << "RE2::Set::Match() matched, but no matches returned?!";
if (error_info != NULL)
error_info->kind = kInconsistent;
return false;
}
v->assign(matches->begin(), matches->end());
}
if (error_info != NULL)
error_info->kind = kNoError;
return true;
}
} // namespace re2

View File

@ -1,80 +0,0 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_SET_H_
#define RE2_SET_H_
#include <string>
#include <utility>
#include <vector>
#include "re2/re2.h"
namespace re2 {
class Prog;
class Regexp;
} // namespace re2
namespace re2 {
// An RE2::Set represents a collection of regexps that can
// be searched for simultaneously.
class RE2::Set {
public:
enum ErrorKind {
kNoError = 0,
kNotCompiled, // The set is not compiled.
kOutOfMemory, // The DFA ran out of memory.
kInconsistent, // The result is inconsistent. This should never happen.
};
struct ErrorInfo {
ErrorKind kind;
};
Set(const RE2::Options& options, RE2::Anchor anchor);
~Set();
// Adds pattern to the set using the options passed to the constructor.
// Returns the index that will identify the regexp in the output of Match(),
// or -1 if the regexp cannot be parsed.
// Indices are assigned in sequential order starting from 0.
// Errors do not increment the index; if error is not NULL, *error will hold
// the error message from the parser.
int Add(const StringPiece& pattern, string* error);
// Compiles the set in preparation for matching.
// Returns false if the compiler runs out of memory.
// Add() must not be called again after Compile().
// Compile() must be called before Match().
bool Compile();
// Returns true if text matches at least one of the regexps in the set.
// Fills v (if not NULL) with the indices of the matching regexps.
// Callers must not expect v to be sorted.
bool Match(const StringPiece& text, std::vector<int>* v) const;
// As above, but populates error_info (if not NULL) when none of the regexps
// in the set matched. This can inform callers when DFA execution fails, for
// example, because they might wish to handle that case differently.
bool Match(const StringPiece& text, std::vector<int>* v,
ErrorInfo* error_info) const;
private:
typedef std::pair<string, re2::Regexp*> Elem;
RE2::Options options_;
RE2::Anchor anchor_;
std::vector<Elem> elem_;
re2::Prog* prog_;
bool compiled_;
int size_;
Set(const Set&) = delete;
Set& operator=(const Set&) = delete;
};
} // namespace re2
#endif // RE2_SET_H_

View File

@ -1,658 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Rewrite POSIX and other features in re
// to use simple extended regular expression features.
// Also sort and simplify character classes.
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
// Parses the regexp src and then simplifies it and sets *dst to the
// string representation of the simplified form. Returns true on success.
// Returns false and sets *error (if error != NULL) on error.
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
string* dst,
RegexpStatus* status) {
Regexp* re = Parse(src, flags, status);
if (re == NULL)
return false;
Regexp* sre = re->Simplify();
re->Decref();
if (sre == NULL) {
// Should not happen, since Simplify never fails.
LOG(ERROR) << "Simplify failed on " << src;
if (status) {
status->set_code(kRegexpInternalError);
status->set_error_arg(src);
}
return false;
}
*dst = sre->ToString();
sre->Decref();
return true;
}
// Assuming the simple_ flags on the children are accurate,
// is this Regexp* simple?
bool Regexp::ComputeSimple() {
Regexp** subs;
switch (op_) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
return true;
case kRegexpConcat:
case kRegexpAlternate:
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
if (!subs[i]->simple())
return false;
return true;
case kRegexpCharClass:
// Simple as long as the char class is not empty, not full.
if (ccb_ != NULL)
return !ccb_->empty() && !ccb_->full();
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
return subs[0]->simple();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
if (!subs[0]->simple())
return false;
switch (subs[0]->op_) {
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpEmptyMatch:
case kRegexpNoMatch:
return false;
default:
break;
}
return true;
case kRegexpRepeat:
return false;
}
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
return false;
}
// Walker subclass used by Simplify.
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
// occurrences of that literal into repeats of that literal. It also works for
// char classes, any char and any byte.
// PostVisit creates the coalesced result, which should then be simplified.
class CoalesceWalker : public Regexp::Walker<Regexp*> {
public:
CoalesceWalker() {}
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside CoalesceWalker so that
// they can edit the private fields of the Regexps they construct.
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
// the parse flags are consistent. (They will not be checked again later.)
static bool CanCoalesce(Regexp* r1, Regexp* r2);
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
// will be empty match and the coalesced op. In other cases, where part of a
// literal string was removed to be coalesced, the array elements afterwards
// will be the coalesced op and the remainder of the literal string.
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
CoalesceWalker(const CoalesceWalker&) = delete;
CoalesceWalker& operator=(const CoalesceWalker&) = delete;
};
// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
private:
// These functions are declared inside SimplifyWalker so that
// they can edit the private fields of the Regexps they construct.
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Caller must Decref return value when done with it.
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags parse_flags);
// Simplifies a character class by expanding any named classes
// into rune ranges. Does not edit re. Does not consume ref to re.
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
SimplifyWalker(const SimplifyWalker&) = delete;
SimplifyWalker& operator=(const SimplifyWalker&) = delete;
};
// Simplifies a regular expression, returning a new regexp.
// The new regexp uses traditional Unix egrep features only,
// plus the Perl (?:) non-capturing parentheses.
// Otherwise, no POSIX or Perl additions. The new regexp
// captures exactly the same subexpressions (with the same indices)
// as the original.
// Does not edit current object.
// Caller must Decref() return value when done with it.
Regexp* Regexp::Simplify() {
CoalesceWalker cw;
Regexp* cre = cw.Walk(this, NULL);
if (cre == NULL)
return cre;
SimplifyWalker sw;
Regexp* sre = sw.Walk(cre, NULL);
cre->Decref();
return sre;
}
#define Simplify DontCallSimplify // Avoid accidental recursion
// Utility function for PostVisit implementations that compares re->sub() with
// child_args to determine whether any child_args changed. In the common case,
// where nothing changed, calls Decref() for all child_args and returns false,
// so PostVisit must return re->Incref(). Otherwise, returns true.
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
for (int i = 0; i < re->nsub(); i++) {
Regexp* sub = re->sub()[i];
Regexp* newsub = child_args[i];
if (newsub != sub)
return true;
}
for (int i = 0; i < re->nsub(); i++) {
Regexp* newsub = child_args[i];
newsub->Decref();
}
return false;
}
Regexp* CoalesceWalker::Copy(Regexp* re) {
return re->Incref();
}
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
return re->Incref();
}
Regexp* CoalesceWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
if (re->nsub() == 0)
return re->Incref();
if (re->op() != kRegexpConcat) {
if (!ChildArgsChanged(re, child_args))
return re->Incref();
// Something changed. Build a new op.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
// Repeats and Captures have additional data that must be copied.
if (re->op() == kRegexpRepeat) {
nre->min_ = re->min();
nre->max_ = re->max();
} else if (re->op() == kRegexpCapture) {
nre->cap_ = re->cap();
}
return nre;
}
bool can_coalesce = false;
for (int i = 0; i < re->nsub(); i++) {
if (i+1 < re->nsub() &&
CanCoalesce(child_args[i], child_args[i+1])) {
can_coalesce = true;
break;
}
}
if (!can_coalesce) {
if (!ChildArgsChanged(re, child_args))
return re->Incref();
// Something changed. Build a new op.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
return nre;
}
for (int i = 0; i < re->nsub(); i++) {
if (i+1 < re->nsub() &&
CanCoalesce(child_args[i], child_args[i+1]))
DoCoalesce(&child_args[i], &child_args[i+1]);
}
// Determine how many empty matches were left by DoCoalesce.
int n = 0;
for (int i = n; i < re->nsub(); i++) {
if (child_args[i]->op() == kRegexpEmptyMatch)
n++;
}
// Build a new op.
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub() - n);
Regexp** nre_subs = nre->sub();
for (int i = 0, j = 0; i < re->nsub(); i++) {
if (child_args[i]->op() == kRegexpEmptyMatch) {
child_args[i]->Decref();
continue;
}
nre_subs[j] = child_args[i];
j++;
}
return nre;
}
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
// any byte.
if ((r1->op() == kRegexpStar ||
r1->op() == kRegexpPlus ||
r1->op() == kRegexpQuest ||
r1->op() == kRegexpRepeat) &&
(r1->sub()[0]->op() == kRegexpLiteral ||
r1->sub()[0]->op() == kRegexpCharClass ||
r1->sub()[0]->op() == kRegexpAnyChar ||
r1->sub()[0]->op() == kRegexpAnyByte)) {
// r2 must be a star/plus/quest/repeat of the same literal, char class,
// any char or any byte.
if ((r2->op() == kRegexpStar ||
r2->op() == kRegexpPlus ||
r2->op() == kRegexpQuest ||
r2->op() == kRegexpRepeat) &&
Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
// The parse flags must be consistent.
((r1->parse_flags() & Regexp::NonGreedy) ==
(r2->parse_flags() & Regexp::NonGreedy))) {
return true;
}
// ... OR an occurrence of that literal, char class, any char or any byte
if (Regexp::Equal(r1->sub()[0], r2)) {
return true;
}
// ... OR a literal string that begins with that literal.
if (r1->sub()[0]->op() == kRegexpLiteral &&
r2->op() == kRegexpLiteralString &&
r2->runes()[0] == r1->sub()[0]->rune() &&
// The parse flags must be consistent.
((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
(r2->parse_flags() & Regexp::FoldCase))) {
return true;
}
}
return false;
}
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
Regexp* r1 = *r1ptr;
Regexp* r2 = *r2ptr;
Regexp* nre = Regexp::Repeat(
r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
switch (r1->op()) {
case kRegexpStar:
nre->min_ = 0;
nre->max_ = -1;
break;
case kRegexpPlus:
nre->min_ = 1;
nre->max_ = -1;
break;
case kRegexpQuest:
nre->min_ = 0;
nre->max_ = 1;
break;
case kRegexpRepeat:
nre->min_ = r1->min();
nre->max_ = r1->max();
break;
default:
LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
nre->Decref();
return;
}
switch (r2->op()) {
case kRegexpStar:
nre->max_ = -1;
goto LeaveEmpty;
case kRegexpPlus:
nre->min_++;
nre->max_ = -1;
goto LeaveEmpty;
case kRegexpQuest:
if (nre->max() != -1)
nre->max_++;
goto LeaveEmpty;
case kRegexpRepeat:
nre->min_ += r2->min();
if (r2->max() == -1)
nre->max_ = -1;
else if (nre->max() != -1)
nre->max_ += r2->max();
goto LeaveEmpty;
case kRegexpLiteral:
case kRegexpCharClass:
case kRegexpAnyChar:
case kRegexpAnyByte:
nre->min_++;
if (nre->max() != -1)
nre->max_++;
goto LeaveEmpty;
LeaveEmpty:
*r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
*r2ptr = nre;
break;
case kRegexpLiteralString: {
Rune r = r1->sub()[0]->rune();
// Determine how much of the literal string is removed.
// We know that we have at least one rune. :)
int n = 1;
while (n < r2->nrunes() && r2->runes()[n] == r)
n++;
nre->min_ += n;
if (nre->max() != -1)
nre->max_ += n;
if (n == r2->nrunes())
goto LeaveEmpty;
*r1ptr = nre;
*r2ptr = Regexp::LiteralString(
&r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
break;
}
default:
LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
nre->Decref();
return;
}
r1->Decref();
r2->Decref();
}
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
// This should never be called, since we use Walk and not
// WalkExponential.
LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
return re->Incref();
}
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
if (re->simple()) {
*stop = true;
return re->Incref();
}
return NULL;
}
Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* parent_arg,
Regexp* pre_arg,
Regexp** child_args,
int nchild_args) {
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpLiteralString:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpEndText:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpHaveMatch:
// All these are always simple.
re->simple_ = true;
return re->Incref();
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
}
case kRegexpCapture: {
Regexp* newsub = child_args[0];
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->cap_ = re->cap();
nre->simple_ = true;
return nre;
}
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
// These are simple as long as the subpiece is simple.
if (newsub == re->sub()[0]) {
newsub->Decref();
re->simple_ = true;
return re->Incref();
}
// These are also idempotent if flags are constant.
if (re->op() == newsub->op() &&
re->parse_flags() == newsub->parse_flags())
return newsub;
Regexp* nre = new Regexp(re->op(), re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
nre->simple_ = true;
return nre;
}
case kRegexpRepeat: {
Regexp* newsub = child_args[0];
// Special case: repeat the empty string as much as
// you want, but it's still the empty string.
if (newsub->op() == kRegexpEmptyMatch)
return newsub;
Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
re->parse_flags());
newsub->Decref();
nre->simple_ = true;
return nre;
}
case kRegexpCharClass: {
Regexp* nre = SimplifyCharClass(re);
nre->simple_ = true;
return nre;
}
}
LOG(ERROR) << "Simplify case not handled: " << re->op();
return re->Incref();
}
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
// Returns a new Regexp, handing the ref to the caller.
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
Regexp::ParseFlags parse_flags) {
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
re->AllocSub(2);
Regexp** subs = re->sub();
subs[0] = re1;
subs[1] = re2;
return re;
}
// Simplifies the expression re{min,max} in terms of *, +, and ?.
// Returns a new regexp. Does not edit re. Does not consume reference to re.
// Caller must Decref return value when done with it.
// The result will *not* necessarily have the right capturing parens
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
// but in the Regexp* representation, both (x) are marked as $1.
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
Regexp::ParseFlags f) {
// x{n,} means at least n matches of x.
if (max == -1) {
// Special case: x{0,} is x*
if (min == 0)
return Regexp::Star(re->Incref(), f);
// Special case: x{1,} is x+
if (min == 1)
return Regexp::Plus(re->Incref(), f);
// General case: x{4,} is xxxx+
Regexp** nre_subs = new Regexp*[min];
for (int i = 0; i < min-1; i++)
nre_subs[i] = re->Incref();
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
Regexp* nre = Regexp::Concat(nre_subs, min, f);
delete[] nre_subs;
return nre;
}
// Special case: (x){0} matches only empty string.
if (min == 0 && max == 0)
return new Regexp(kRegexpEmptyMatch, f);
// Special case: x{1} is just x.
if (min == 1 && max == 1)
return re->Incref();
// General case: x{n,m} means n copies of x and m copies of x?.
// The machine will do less work if we nest the final m copies,
// so that x{2,5} = xx(x(x(x)?)?)?
// Build leading prefix: xx. Capturing only on the last one.
Regexp* nre = NULL;
if (min > 0) {
Regexp** nre_subs = new Regexp*[min];
for (int i = 0; i < min; i++)
nre_subs[i] = re->Incref();
nre = Regexp::Concat(nre_subs, min, f);
delete[] nre_subs;
}
// Build and attach suffix: (x(x(x)?)?)?
if (max > min) {
Regexp* suf = Regexp::Quest(re->Incref(), f);
for (int i = min+1; i < max; i++)
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
if (nre == NULL)
nre = suf;
else
nre = Concat2(nre, suf, f);
}
if (nre == NULL) {
// Some degenerate case, like min > max, or min < max < 0.
// This shouldn't happen, because the parser rejects such regexps.
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
return new Regexp(kRegexpNoMatch, f);
}
return nre;
}
// Simplifies a character class.
// Caller must Decref return value when done with it.
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
CharClass* cc = re->cc();
// Special cases
if (cc->empty())
return new Regexp(kRegexpNoMatch, re->parse_flags());
if (cc->full())
return new Regexp(kRegexpAnyChar, re->parse_flags());
return re->Incref();
}
} // namespace re2

View File

@ -1,65 +0,0 @@
// Copyright 2004 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "re2/stringpiece.h"
#include <ostream>
#include "util/util.h"
namespace re2 {
const StringPiece::size_type StringPiece::npos; // initialized in stringpiece.h
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
size_type pos) const {
size_type ret = std::min(size_ - pos, n);
memcpy(buf, data_ + pos, ret);
return ret;
}
StringPiece StringPiece::substr(size_type pos, size_type n) const {
if (pos > size_) pos = size_;
if (n > size_ - pos) n = size_ - pos;
return StringPiece(data_ + pos, n);
}
StringPiece::size_type StringPiece::find(const StringPiece& s,
size_type pos) const {
if (pos > size_) return npos;
const_pointer result = std::search(data_ + pos, data_ + size_,
s.data_, s.data_ + s.size_);
size_type xpos = result - data_;
return xpos + s.size_ <= size_ ? xpos : npos;
}
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
if (size_ <= 0 || pos >= size_) return npos;
const_pointer result = std::find(data_ + pos, data_ + size_, c);
return result != data_ + size_ ? result - data_ : npos;
}
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
size_type pos) const {
if (size_ < s.size_) return npos;
if (s.size_ == 0) return std::min(size_, pos);
const_pointer last = data_ + std::min(size_ - s.size_, pos) + s.size_;
const_pointer result = std::find_end(data_, last, s.data_, s.data_ + s.size_);
return result != last ? result - data_ : npos;
}
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
if (size_ <= 0) return npos;
for (size_t i = std::min(pos + 1, size_); i != 0;) {
if (data_[--i] == c) return i;
}
return npos;
}
std::ostream& operator<<(std::ostream& o, const StringPiece& p) {
o.write(p.data(), p.size());
return o;
}
} // namespace re2

View File

@ -1,190 +0,0 @@
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_STRINGPIECE_H_
#define RE2_STRINGPIECE_H_
// A string-like object that points to a sized piece of memory.
//
// Functions or methods may use const StringPiece& parameters to accept either
// a "const char*" or a "string" value that will be implicitly converted to
// a StringPiece. The implicit conversion means that it is often appropriate
// to include this .h file in other files rather than forward-declaring
// StringPiece as would be appropriate for most other Google classes.
//
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
// conversions from "const char*" to "string" and back again.
//
//
// Arghh! I wish C++ literals were "string".
#include <stddef.h>
#include <string.h>
#include <algorithm>
#include <iosfwd>
#include <iterator>
#include <string>
namespace re2 {
class StringPiece {
public:
typedef char value_type;
typedef char* pointer;
typedef const char* const_pointer;
typedef char& reference;
typedef const char& const_reference;
typedef const char* const_iterator;
typedef const_iterator iterator;
typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
typedef const_reverse_iterator reverse_iterator;
typedef size_t size_type;
typedef ptrdiff_t difference_type;
static const size_type npos = static_cast<size_type>(-1);
// We provide non-explicit singleton constructors so users can pass
// in a "const char*" or a "string" wherever a "StringPiece" is
// expected.
StringPiece()
: data_(NULL), size_(0) {}
StringPiece(const std::string& str)
: data_(str.data()), size_(str.size()) {}
StringPiece(const char* str)
: data_(str), size_(str == NULL ? 0 : strlen(str)) {}
StringPiece(const char* str, size_type len)
: data_(str), size_(len) {}
const_iterator begin() const { return data_; }
const_iterator end() const { return data_ + size_; }
const_reverse_iterator rbegin() const {
return const_reverse_iterator(data_ + size_);
}
const_reverse_iterator rend() const {
return const_reverse_iterator(data_);
}
size_type size() const { return size_; }
size_type length() const { return size_; }
bool empty() const { return size_ == 0; }
const_reference operator[](size_type i) const { return data_[i]; }
const_pointer data() const { return data_; }
void remove_prefix(size_type n) {
data_ += n;
size_ -= n;
}
void remove_suffix(size_type n) {
size_ -= n;
}
void set(const char* str) {
data_ = str;
size_ = str == NULL ? 0 : strlen(str);
}
void set(const char* str, size_type len) {
data_ = str;
size_ = len;
}
std::string as_string() const {
return std::string(data_, size_);
}
// We also define ToString() here, since many other string-like
// interfaces name the routine that converts to a C++ string
// "ToString", and it's confusing to have the method that does that
// for a StringPiece be called "as_string()". We also leave the
// "as_string()" method defined here for existing code.
std::string ToString() const {
return std::string(data_, size_);
}
void CopyToString(std::string* target) const {
target->assign(data_, size_);
}
void AppendToString(std::string* target) const {
target->append(data_, size_);
}
size_type copy(char* buf, size_type n, size_type pos = 0) const;
StringPiece substr(size_type pos = 0, size_type n = npos) const;
int compare(const StringPiece& x) const {
size_type min_size = std::min(size(), x.size());
if (min_size > 0) {
int r = memcmp(data(), x.data(), min_size);
if (r < 0) return -1;
if (r > 0) return 1;
}
if (size() < x.size()) return -1;
if (size() > x.size()) return 1;
return 0;
}
// Does "this" start with "x"?
bool starts_with(const StringPiece& x) const {
return x.empty() ||
(size() >= x.size() && memcmp(data(), x.data(), x.size()) == 0);
}
// Does "this" end with "x"?
bool ends_with(const StringPiece& x) const {
return x.empty() ||
(size() >= x.size() &&
memcmp(data() + (size() - x.size()), x.data(), x.size()) == 0);
}
bool contains(const StringPiece& s) const {
return find(s) != npos;
}
size_type find(const StringPiece& s, size_type pos = 0) const;
size_type find(char c, size_type pos = 0) const;
size_type rfind(const StringPiece& s, size_type pos = npos) const;
size_type rfind(char c, size_type pos = npos) const;
private:
const_pointer data_;
size_type size_;
};
inline bool operator==(const StringPiece& x, const StringPiece& y) {
StringPiece::size_type len = x.size();
if (len != y.size()) return false;
return x.data() == y.data() || len == 0 ||
memcmp(x.data(), y.data(), len) == 0;
}
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
return !(x == y);
}
inline bool operator<(const StringPiece& x, const StringPiece& y) {
StringPiece::size_type min_size = std::min(x.size(), y.size());
int r = min_size == 0 ? 0 : memcmp(x.data(), y.data(), min_size);
return (r < 0) || (r == 0 && x.size() < y.size());
}
inline bool operator>(const StringPiece& x, const StringPiece& y) {
return y < x;
}
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
return !(x > y);
}
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
return !(x < y);
}
// Allow StringPiece to be logged.
std::ostream& operator<<(std::ostream& o, const StringPiece& p);
} // namespace re2
#endif // RE2_STRINGPIECE_H_

View File

@ -1,351 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Format a regular expression structure as a string.
// Tested by parse_test.cc
#include <string.h>
#include <string>
#include "util/util.h"
#include "util/logging.h"
#include "util/strutil.h"
#include "util/utf.h"
#include "re2/regexp.h"
#include "re2/walker-inl.h"
namespace re2 {
enum {
PrecAtom,
PrecUnary,
PrecConcat,
PrecAlternate,
PrecEmpty,
PrecParen,
PrecToplevel,
};
// Helper function. See description below.
static void AppendCCRange(string* t, Rune lo, Rune hi);
// Walker to generate string in s_.
// The arg pointers are actually integers giving the
// context precedence.
// The child_args are always NULL.
class ToStringWalker : public Regexp::Walker<int> {
public:
explicit ToStringWalker(string* t) : t_(t) {}
virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args);
virtual int ShortVisit(Regexp* re, int parent_arg) {
return 0;
}
private:
string* t_; // The string the walker appends to.
ToStringWalker(const ToStringWalker&) = delete;
ToStringWalker& operator=(const ToStringWalker&) = delete;
};
string Regexp::ToString() {
string t;
ToStringWalker w(&t);
w.WalkExponential(this, PrecToplevel, 100000);
if (w.stopped_early())
t += " [truncated]";
return t;
}
#define ToString DontCallToString // Avoid accidental recursion.
// Visits re before children are processed.
// Appends ( if needed and passes new precedence to children.
int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
int prec = parent_arg;
int nprec = PrecAtom;
switch (re->op()) {
case kRegexpNoMatch:
case kRegexpEmptyMatch:
case kRegexpLiteral:
case kRegexpAnyChar:
case kRegexpAnyByte:
case kRegexpBeginLine:
case kRegexpEndLine:
case kRegexpBeginText:
case kRegexpEndText:
case kRegexpWordBoundary:
case kRegexpNoWordBoundary:
case kRegexpCharClass:
case kRegexpHaveMatch:
nprec = PrecAtom;
break;
case kRegexpConcat:
case kRegexpLiteralString:
if (prec < PrecConcat)
t_->append("(?:");
nprec = PrecConcat;
break;
case kRegexpAlternate:
if (prec < PrecAlternate)
t_->append("(?:");
nprec = PrecAlternate;
break;
case kRegexpCapture:
t_->append("(");
if (re->cap() == 0)
LOG(DFATAL) << "kRegexpCapture cap() == 0";
if (re->name()) {
t_->append("?P<");
t_->append(*re->name());
t_->append(">");
}
nprec = PrecParen;
break;
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
case kRegexpRepeat:
if (prec < PrecUnary)
t_->append("(?:");
// The subprecedence here is PrecAtom instead of PrecUnary
// because PCRE treats two unary ops in a row as a parse error.
nprec = PrecAtom;
break;
}
return nprec;
}
static void AppendLiteral(string *t, Rune r, bool foldcase) {
if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
t->append(1, '\\');
t->append(1, static_cast<char>(r));
} else if (foldcase && 'a' <= r && r <= 'z') {
r -= 'a' - 'A';
t->append(1, '[');
t->append(1, static_cast<char>(r));
t->append(1, static_cast<char>(r) + 'a' - 'A');
t->append(1, ']');
} else {
AppendCCRange(t, r, r);
}
}
// Visits re after children are processed.
// For childless regexps, all the work is done here.
// For regexps with children, append any unary suffixes or ).
int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
int* child_args, int nchild_args) {
int prec = parent_arg;
switch (re->op()) {
case kRegexpNoMatch:
// There's no simple symbol for "no match", but
// [^0-Runemax] excludes everything.
t_->append("[^\\x00-\\x{10ffff}]");
break;
case kRegexpEmptyMatch:
// Append (?:) to make empty string visible,
// unless this is already being parenthesized.
if (prec < PrecEmpty)
t_->append("(?:)");
break;
case kRegexpLiteral:
AppendLiteral(t_, re->rune(),
(re->parse_flags() & Regexp::FoldCase) != 0);
break;
case kRegexpLiteralString:
for (int i = 0; i < re->nrunes(); i++)
AppendLiteral(t_, re->runes()[i],
(re->parse_flags() & Regexp::FoldCase) != 0);
if (prec < PrecConcat)
t_->append(")");
break;
case kRegexpConcat:
if (prec < PrecConcat)
t_->append(")");
break;
case kRegexpAlternate:
// Clumsy but workable: the children all appended |
// at the end of their strings, so just remove the last one.
if ((*t_)[t_->size()-1] == '|')
t_->erase(t_->size()-1);
else
LOG(DFATAL) << "Bad final char: " << t_;
if (prec < PrecAlternate)
t_->append(")");
break;
case kRegexpStar:
t_->append("*");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpPlus:
t_->append("+");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpQuest:
t_->append("?");
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpRepeat:
if (re->max() == -1)
t_->append(StringPrintf("{%d,}", re->min()));
else if (re->min() == re->max())
t_->append(StringPrintf("{%d}", re->min()));
else
t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
if (re->parse_flags() & Regexp::NonGreedy)
t_->append("?");
if (prec < PrecUnary)
t_->append(")");
break;
case kRegexpAnyChar:
t_->append(".");
break;
case kRegexpAnyByte:
t_->append("\\C");
break;
case kRegexpBeginLine:
t_->append("^");
break;
case kRegexpEndLine:
t_->append("$");
break;
case kRegexpBeginText:
t_->append("(?-m:^)");
break;
case kRegexpEndText:
if (re->parse_flags() & Regexp::WasDollar)
t_->append("(?-m:$)");
else
t_->append("\\z");
break;
case kRegexpWordBoundary:
t_->append("\\b");
break;
case kRegexpNoWordBoundary:
t_->append("\\B");
break;
case kRegexpCharClass: {
if (re->cc()->size() == 0) {
t_->append("[^\\x00-\\x{10ffff}]");
break;
}
t_->append("[");
// Heuristic: show class as negated if it contains the
// non-character 0xFFFE.
CharClass* cc = re->cc();
if (cc->Contains(0xFFFE)) {
cc = cc->Negate();
t_->append("^");
}
for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
AppendCCRange(t_, i->lo, i->hi);
if (cc != re->cc())
cc->Delete();
t_->append("]");
break;
}
case kRegexpCapture:
t_->append(")");
break;
case kRegexpHaveMatch:
// There's no syntax accepted by the parser to generate
// this node (it is generated by RE2::Set) so make something
// up that is readable but won't compile.
t_->append("(?HaveMatch:%d)", re->match_id());
break;
}
// If the parent is an alternation, append the | for it.
if (prec == PrecAlternate)
t_->append("|");
return 0;
}
// Appends a rune for use in a character class to the string t.
static void AppendCCChar(string* t, Rune r) {
if (0x20 <= r && r <= 0x7E) {
if (strchr("[]^-\\", r))
t->append("\\");
t->append(1, static_cast<char>(r));
return;
}
switch (r) {
default:
break;
case '\r':
t->append("\\r");
return;
case '\t':
t->append("\\t");
return;
case '\n':
t->append("\\n");
return;
case '\f':
t->append("\\f");
return;
}
if (r < 0x100) {
StringAppendF(t, "\\x%02x", static_cast<int>(r));
return;
}
StringAppendF(t, "\\x{%x}", static_cast<int>(r));
}
static void AppendCCRange(string* t, Rune lo, Rune hi) {
if (lo > hi)
return;
AppendCCChar(t, lo);
if (lo < hi) {
t->append("-");
AppendCCChar(t, hi);
}
}
} // namespace re2

View File

@ -1,297 +0,0 @@
# Copyright 2008 The RE2 Authors. All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
"""Parser for Unicode data files (as distributed by unicode.org)."""
import os
import re
import urllib2
# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "http://www.unicode.org/Public/10.0.0/ucd"
# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF
class Error(Exception):
"""Unicode error base class."""
class InputError(Error):
"""Unicode input error class. Raised on invalid input."""
def _UInt(s):
"""Converts string to Unicode code point ('263A' => 0x263a).
Args:
s: string to convert
Returns:
Unicode code point
Raises:
InputError: the string is not a valid Unicode value.
"""
try:
v = int(s, 16)
except ValueError:
v = -1
if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
raise InputError("invalid Unicode value %s" % (s,))
return v
def _URange(s):
"""Converts string to Unicode range.
'0001..0003' => [1, 2, 3].
'0001' => [1].
Args:
s: string to convert
Returns:
Unicode range
Raises:
InputError: the string is not a valid Unicode range.
"""
a = s.split("..")
if len(a) == 1:
return [_UInt(a[0])]
if len(a) == 2:
lo = _UInt(a[0])
hi = _UInt(a[1])
if lo < hi:
return range(lo, hi + 1)
raise InputError("invalid Unicode range %s" % (s,))
def _UStr(v):
"""Converts Unicode code point to hex string.
0x263a => '0x263A'.
Args:
v: code point to convert
Returns:
Unicode string
Raises:
InputError: the argument is not a valid Unicode value.
"""
if v < 0 or v > _RUNE_MAX:
raise InputError("invalid Unicode value %s" % (v,))
return "0x%04X" % (v,)
def _ParseContinue(s):
"""Parses a Unicode continuation field.
These are of the form '<Name, First>' or '<Name, Last>'.
Instead of giving an explicit range in a single table entry,
some Unicode tables use two entries, one for the first
code value in the range and one for the last.
The first entry's description is '<Name, First>' instead of 'Name'
and the second is '<Name, Last>'.
'<Name, First>' => ('Name', 'First')
'<Name, Last>' => ('Name', 'Last')
'Anything else' => ('Anything else', None)
Args:
s: continuation field string
Returns:
pair: name and ('First', 'Last', or None)
"""
match = re.match("<(.*), (First|Last)>", s)
if match is not None:
return match.groups()
return (s, None)
def ReadUnicodeTable(filename, nfields, doline):
"""Generic Unicode table text file reader.
The reader takes care of stripping out comments and also
parsing the two different ways that the Unicode tables specify
code ranges (using the .. notation and splitting the range across
multiple lines).
Each non-comment line in the table is expected to have the given
number of fields. The first field is known to be the Unicode value
and the second field its description.
The reader calls doline(codes, fields) for each entry in the table.
If fn raises an exception, the reader prints that exception,
prefixed with the file name and line number, and continues
processing the file. When done with the file, the reader re-raises
the first exception encountered during the file.
Arguments:
filename: the Unicode data file to read, or a file-like object.
nfields: the number of expected fields per line in that file.
doline: the function to call for each table entry.
Raises:
InputError: nfields is invalid (must be >= 2).
"""
if nfields < 2:
raise InputError("invalid number of fields %d" % (nfields,))
if type(filename) == str:
if filename.startswith("http://"):
fil = urllib2.urlopen(filename)
else:
fil = open(filename, "r")
else:
fil = filename
first = None # first code in multiline range
expect_last = None # tag expected for "Last" line in multiline range
lineno = 0 # current line number
for line in fil:
lineno += 1
try:
# Chop # comments and white space; ignore empty lines.
sharp = line.find("#")
if sharp >= 0:
line = line[:sharp]
line = line.strip()
if not line:
continue
# Split fields on ";", chop more white space.
# Must have the expected number of fields.
fields = [s.strip() for s in line.split(";")]
if len(fields) != nfields:
raise InputError("wrong number of fields %d %d - %s" %
(len(fields), nfields, line))
# The Unicode text files have two different ways
# to list a Unicode range. Either the first field is
# itself a range (0000..FFFF), or the range is split
# across two lines, with the second field noting
# the continuation.
codes = _URange(fields[0])
(name, cont) = _ParseContinue(fields[1])
if expect_last is not None:
# If the last line gave the First code in a range,
# this one had better give the Last one.
if (len(codes) != 1 or codes[0] <= first or
cont != "Last" or name != expect_last):
raise InputError("expected Last line for %s" %
(expect_last,))
codes = range(first, codes[0] + 1)
first = None
expect_last = None
fields[0] = "%04X..%04X" % (codes[0], codes[-1])
fields[1] = name
elif cont == "First":
# Otherwise, if this is the First code in a range,
# remember it and go to the next line.
if len(codes) != 1:
raise InputError("bad First line: range given")
expect_last = name
first = codes[0]
continue
doline(codes, fields)
except Exception, e:
print "%s:%d: %s" % (filename, lineno, e)
raise
if expect_last is not None:
raise InputError("expected Last line for %s; got EOF" %
(expect_last,))
def CaseGroups(unicode_dir=_UNICODE_DIR):
"""Returns list of Unicode code groups equivalent under case folding.
Each group is a sorted list of code points,
and the list of groups is sorted by first code point
in the group.
Args:
unicode_dir: Unicode data directory
Returns:
list of Unicode code groups
"""
# Dict mapping lowercase code point to fold-equivalent group.
togroup = {}
def DoLine(codes, fields):
"""Process single CaseFolding.txt line, updating togroup."""
(_, foldtype, lower, _) = fields
if foldtype not in ("C", "S"):
return
lower = _UInt(lower)
togroup.setdefault(lower, [lower]).extend(codes)
ReadUnicodeTable(unicode_dir+"/CaseFolding.txt", 4, DoLine)
groups = togroup.values()
for g in groups:
g.sort()
groups.sort()
return togroup, groups
def Scripts(unicode_dir=_UNICODE_DIR):
"""Returns dict mapping script names to code lists.
Args:
unicode_dir: Unicode data directory
Returns:
dict mapping script names to code lists
"""
scripts = {}
def DoLine(codes, fields):
"""Process single Scripts.txt line, updating scripts."""
(_, name) = fields
scripts.setdefault(name, []).extend(codes)
ReadUnicodeTable(unicode_dir+"/Scripts.txt", 2, DoLine)
return scripts
def Categories(unicode_dir=_UNICODE_DIR):
"""Returns dict mapping category names to code lists.
Args:
unicode_dir: Unicode data directory
Returns:
dict mapping category names to code lists
"""
categories = {}
def DoLine(codes, fields):
"""Process single UnicodeData.txt line, updating categories."""
category = fields[2]
categories.setdefault(category, []).extend(codes)
# Add codes from Lu into L, etc.
if len(category) > 1:
short = category[0]
categories.setdefault(short, []).extend(codes)
ReadUnicodeTable(unicode_dir+"/UnicodeData.txt", 15, DoLine)
return categories

View File

@ -1,558 +0,0 @@
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
// make_unicode_casefold.py >unicode_casefold.cc
#include "re2/unicode_casefold.h"
namespace re2 {
// 1295 groups, 2620 pairs, 343 ranges
const CaseFold unicode_casefold[] = {
{ 65, 90, 32 },
{ 97, 106, -32 },
{ 107, 107, 8383 },
{ 108, 114, -32 },
{ 115, 115, 268 },
{ 116, 122, -32 },
{ 181, 181, 743 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 223, 223, 7615 },
{ 224, 228, -32 },
{ 229, 229, 8262 },
{ 230, 246, -32 },
{ 248, 254, -32 },
{ 255, 255, 121 },
{ 256, 303, EvenOdd },
{ 306, 311, EvenOdd },
{ 313, 328, OddEven },
{ 330, 375, EvenOdd },
{ 376, 376, -121 },
{ 377, 382, OddEven },
{ 383, 383, -300 },
{ 384, 384, 195 },
{ 385, 385, 210 },
{ 386, 389, EvenOdd },
{ 390, 390, 206 },
{ 391, 392, OddEven },
{ 393, 394, 205 },
{ 395, 396, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 402, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 405, 405, 97 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 409, EvenOdd },
{ 410, 410, 163 },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 414, 414, 130 },
{ 415, 415, 214 },
{ 416, 421, EvenOdd },
{ 422, 422, 218 },
{ 423, 424, OddEven },
{ 425, 425, 218 },
{ 428, 429, EvenOdd },
{ 430, 430, 218 },
{ 431, 432, OddEven },
{ 433, 434, 217 },
{ 435, 438, OddEven },
{ 439, 439, 219 },
{ 440, 441, EvenOdd },
{ 444, 445, EvenOdd },
{ 447, 447, 56 },
{ 452, 452, EvenOdd },
{ 453, 453, OddEven },
{ 454, 454, -2 },
{ 455, 455, OddEven },
{ 456, 456, EvenOdd },
{ 457, 457, -2 },
{ 458, 458, EvenOdd },
{ 459, 459, OddEven },
{ 460, 460, -2 },
{ 461, 476, OddEven },
{ 477, 477, -79 },
{ 478, 495, EvenOdd },
{ 497, 497, OddEven },
{ 498, 498, EvenOdd },
{ 499, 499, -2 },
{ 500, 501, EvenOdd },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 543, EvenOdd },
{ 544, 544, -130 },
{ 546, 563, EvenOdd },
{ 570, 570, 10795 },
{ 571, 572, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 575, 576, 10815 },
{ 577, 578, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 591, EvenOdd },
{ 592, 592, 10783 },
{ 593, 593, 10780 },
{ 594, 594, 10782 },
{ 595, 595, -210 },
{ 596, 596, -206 },
{ 598, 599, -205 },
{ 601, 601, -202 },
{ 603, 603, -203 },
{ 604, 604, 42319 },
{ 608, 608, -205 },
{ 609, 609, 42315 },
{ 611, 611, -207 },
{ 613, 613, 42280 },
{ 614, 614, 42308 },
{ 616, 616, -209 },
{ 617, 617, -211 },
{ 618, 618, 42308 },
{ 619, 619, 10743 },
{ 620, 620, 42305 },
{ 623, 623, -211 },
{ 625, 625, 10749 },
{ 626, 626, -213 },
{ 629, 629, -214 },
{ 637, 637, 10727 },
{ 640, 640, -218 },
{ 643, 643, -218 },
{ 647, 647, 42282 },
{ 648, 648, -218 },
{ 649, 649, -69 },
{ 650, 651, -217 },
{ 652, 652, -71 },
{ 658, 658, -219 },
{ 669, 669, 42261 },
{ 670, 670, 42258 },
{ 837, 837, 84 },
{ 880, 883, EvenOdd },
{ 886, 887, EvenOdd },
{ 891, 893, 130 },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 931, 31 },
{ 932, 939, 32 },
{ 940, 940, -38 },
{ 941, 943, -37 },
{ 945, 945, -32 },
{ 946, 946, 30 },
{ 947, 948, -32 },
{ 949, 949, 64 },
{ 950, 951, -32 },
{ 952, 952, 25 },
{ 953, 953, 7173 },
{ 954, 954, 54 },
{ 955, 955, -32 },
{ 956, 956, -775 },
{ 957, 959, -32 },
{ 960, 960, 22 },
{ 961, 961, 48 },
{ 962, 962, EvenOdd },
{ 963, 965, -32 },
{ 966, 966, 15 },
{ 967, 968, -32 },
{ 969, 969, 7517 },
{ 970, 971, -32 },
{ 972, 972, -64 },
{ 973, 974, -63 },
{ 975, 975, 8 },
{ 976, 976, -62 },
{ 977, 977, 35 },
{ 981, 981, -47 },
{ 982, 982, -54 },
{ 983, 983, -8 },
{ 984, 1007, EvenOdd },
{ 1008, 1008, -86 },
{ 1009, 1009, -80 },
{ 1010, 1010, 7 },
{ 1011, 1011, -116 },
{ 1012, 1012, -92 },
{ 1013, 1013, -96 },
{ 1015, 1016, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1019, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1072, 1073, -32 },
{ 1074, 1074, 6222 },
{ 1075, 1075, -32 },
{ 1076, 1076, 6221 },
{ 1077, 1085, -32 },
{ 1086, 1086, 6212 },
{ 1087, 1088, -32 },
{ 1089, 1090, 6210 },
{ 1091, 1097, -32 },
{ 1098, 1098, 6204 },
{ 1099, 1103, -32 },
{ 1104, 1119, -80 },
{ 1120, 1122, EvenOdd },
{ 1123, 1123, 6180 },
{ 1124, 1153, EvenOdd },
{ 1162, 1215, EvenOdd },
{ 1216, 1216, 15 },
{ 1217, 1230, OddEven },
{ 1231, 1231, -15 },
{ 1232, 1327, EvenOdd },
{ 1329, 1366, 48 },
{ 1377, 1414, -48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5024, 5103, 38864 },
{ 5104, 5109, 8 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6254 },
{ 7297, 7297, -6253 },
{ 7298, 7298, -6244 },
{ 7299, 7299, -6242 },
{ 7300, 7300, EvenOdd },
{ 7301, 7301, -6243 },
{ 7302, 7302, -6236 },
{ 7303, 7303, -6181 },
{ 7304, 7304, 35266 },
{ 7545, 7545, 35332 },
{ 7549, 7549, 3814 },
{ 7680, 7776, EvenOdd },
{ 7777, 7777, 58 },
{ 7778, 7829, EvenOdd },
{ 7835, 7835, -59 },
{ 7838, 7838, -7615 },
{ 7840, 7935, EvenOdd },
{ 7936, 7943, 8 },
{ 7944, 7951, -8 },
{ 7952, 7957, 8 },
{ 7960, 7965, -8 },
{ 7968, 7975, 8 },
{ 7976, 7983, -8 },
{ 7984, 7991, 8 },
{ 7992, 7999, -8 },
{ 8000, 8005, 8 },
{ 8008, 8013, -8 },
{ 8017, 8017, 8 },
{ 8019, 8019, 8 },
{ 8021, 8021, 8 },
{ 8023, 8023, 8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8032, 8039, 8 },
{ 8040, 8047, -8 },
{ 8048, 8049, 74 },
{ 8050, 8053, 86 },
{ 8054, 8055, 100 },
{ 8056, 8057, 128 },
{ 8058, 8059, 112 },
{ 8060, 8061, 126 },
{ 8064, 8071, 8 },
{ 8072, 8079, -8 },
{ 8080, 8087, 8 },
{ 8088, 8095, -8 },
{ 8096, 8103, 8 },
{ 8104, 8111, -8 },
{ 8112, 8113, 8 },
{ 8115, 8115, 9 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7289 },
{ 8131, 8131, 9 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8144, 8145, 8 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8160, 8161, 8 },
{ 8165, 8165, 7 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8179, 8179, 9 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7549 },
{ 8490, 8490, -8415 },
{ 8491, 8491, -8294 },
{ 8498, 8498, 28 },
{ 8526, 8526, -28 },
{ 8544, 8559, 16 },
{ 8560, 8575, -16 },
{ 8579, 8580, OddEven },
{ 9398, 9423, 26 },
{ 9424, 9449, -26 },
{ 11264, 11310, 48 },
{ 11312, 11358, -48 },
{ 11360, 11361, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11365, 11365, -10795 },
{ 11366, 11366, -10792 },
{ 11367, 11372, OddEven },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11379, EvenOdd },
{ 11381, 11382, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11491, EvenOdd },
{ 11499, 11502, OddEven },
{ 11506, 11507, EvenOdd },
{ 11520, 11557, -7264 },
{ 11559, 11559, -7264 },
{ 11565, 11565, -7264 },
{ 42560, 42570, EvenOdd },
{ 42571, 42571, -35267 },
{ 42572, 42605, EvenOdd },
{ 42624, 42651, EvenOdd },
{ 42786, 42799, EvenOdd },
{ 42802, 42863, EvenOdd },
{ 42873, 42876, OddEven },
{ 42877, 42877, -35332 },
{ 42878, 42887, EvenOdd },
{ 42891, 42892, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42899, EvenOdd },
{ 42902, 42921, EvenOdd },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42935, EvenOdd },
{ 43859, 43859, -928 },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 65345, 65370, -32 },
{ 66560, 66599, 40 },
{ 66600, 66639, -40 },
{ 66736, 66771, 40 },
{ 66776, 66811, -40 },
{ 68736, 68786, 64 },
{ 68800, 68850, -64 },
{ 71840, 71871, 32 },
{ 71872, 71903, -32 },
{ 125184, 125217, 34 },
{ 125218, 125251, -34 },
};
const int num_unicode_casefold = 343;
// 1295 groups, 1325 pairs, 191 ranges
const CaseFold unicode_tolower[] = {
{ 65, 90, 32 },
{ 181, 181, 775 },
{ 192, 214, 32 },
{ 216, 222, 32 },
{ 256, 302, EvenOddSkip },
{ 306, 310, EvenOddSkip },
{ 313, 327, OddEvenSkip },
{ 330, 374, EvenOddSkip },
{ 376, 376, -121 },
{ 377, 381, OddEvenSkip },
{ 383, 383, -268 },
{ 385, 385, 210 },
{ 386, 388, EvenOddSkip },
{ 390, 390, 206 },
{ 391, 391, OddEven },
{ 393, 394, 205 },
{ 395, 395, OddEven },
{ 398, 398, 79 },
{ 399, 399, 202 },
{ 400, 400, 203 },
{ 401, 401, OddEven },
{ 403, 403, 205 },
{ 404, 404, 207 },
{ 406, 406, 211 },
{ 407, 407, 209 },
{ 408, 408, EvenOdd },
{ 412, 412, 211 },
{ 413, 413, 213 },
{ 415, 415, 214 },
{ 416, 420, EvenOddSkip },
{ 422, 422, 218 },
{ 423, 423, OddEven },
{ 425, 425, 218 },
{ 428, 428, EvenOdd },
{ 430, 430, 218 },
{ 431, 431, OddEven },
{ 433, 434, 217 },
{ 435, 437, OddEvenSkip },
{ 439, 439, 219 },
{ 440, 440, EvenOdd },
{ 444, 444, EvenOdd },
{ 452, 452, 2 },
{ 453, 453, OddEven },
{ 455, 455, 2 },
{ 456, 456, EvenOdd },
{ 458, 458, 2 },
{ 459, 475, OddEvenSkip },
{ 478, 494, EvenOddSkip },
{ 497, 497, 2 },
{ 498, 500, EvenOddSkip },
{ 502, 502, -97 },
{ 503, 503, -56 },
{ 504, 542, EvenOddSkip },
{ 544, 544, -130 },
{ 546, 562, EvenOddSkip },
{ 570, 570, 10795 },
{ 571, 571, OddEven },
{ 573, 573, -163 },
{ 574, 574, 10792 },
{ 577, 577, OddEven },
{ 579, 579, -195 },
{ 580, 580, 69 },
{ 581, 581, 71 },
{ 582, 590, EvenOddSkip },
{ 837, 837, 116 },
{ 880, 882, EvenOddSkip },
{ 886, 886, EvenOdd },
{ 895, 895, 116 },
{ 902, 902, 38 },
{ 904, 906, 37 },
{ 908, 908, 64 },
{ 910, 911, 63 },
{ 913, 929, 32 },
{ 931, 939, 32 },
{ 962, 962, EvenOdd },
{ 975, 975, 8 },
{ 976, 976, -30 },
{ 977, 977, -25 },
{ 981, 981, -15 },
{ 982, 982, -22 },
{ 984, 1006, EvenOddSkip },
{ 1008, 1008, -54 },
{ 1009, 1009, -48 },
{ 1012, 1012, -60 },
{ 1013, 1013, -64 },
{ 1015, 1015, OddEven },
{ 1017, 1017, -7 },
{ 1018, 1018, EvenOdd },
{ 1021, 1023, -130 },
{ 1024, 1039, 80 },
{ 1040, 1071, 32 },
{ 1120, 1152, EvenOddSkip },
{ 1162, 1214, EvenOddSkip },
{ 1216, 1216, 15 },
{ 1217, 1229, OddEvenSkip },
{ 1232, 1326, EvenOddSkip },
{ 1329, 1366, 48 },
{ 4256, 4293, 7264 },
{ 4295, 4295, 7264 },
{ 4301, 4301, 7264 },
{ 5112, 5117, -8 },
{ 7296, 7296, -6222 },
{ 7297, 7297, -6221 },
{ 7298, 7298, -6212 },
{ 7299, 7300, -6210 },
{ 7301, 7301, -6211 },
{ 7302, 7302, -6204 },
{ 7303, 7303, -6180 },
{ 7304, 7304, 35267 },
{ 7680, 7828, EvenOddSkip },
{ 7835, 7835, -58 },
{ 7838, 7838, -7615 },
{ 7840, 7934, EvenOddSkip },
{ 7944, 7951, -8 },
{ 7960, 7965, -8 },
{ 7976, 7983, -8 },
{ 7992, 7999, -8 },
{ 8008, 8013, -8 },
{ 8025, 8025, -8 },
{ 8027, 8027, -8 },
{ 8029, 8029, -8 },
{ 8031, 8031, -8 },
{ 8040, 8047, -8 },
{ 8072, 8079, -8 },
{ 8088, 8095, -8 },
{ 8104, 8111, -8 },
{ 8120, 8121, -8 },
{ 8122, 8123, -74 },
{ 8124, 8124, -9 },
{ 8126, 8126, -7173 },
{ 8136, 8139, -86 },
{ 8140, 8140, -9 },
{ 8152, 8153, -8 },
{ 8154, 8155, -100 },
{ 8168, 8169, -8 },
{ 8170, 8171, -112 },
{ 8172, 8172, -7 },
{ 8184, 8185, -128 },
{ 8186, 8187, -126 },
{ 8188, 8188, -9 },
{ 8486, 8486, -7517 },
{ 8490, 8490, -8383 },
{ 8491, 8491, -8262 },
{ 8498, 8498, 28 },
{ 8544, 8559, 16 },
{ 8579, 8579, OddEven },
{ 9398, 9423, 26 },
{ 11264, 11310, 48 },
{ 11360, 11360, EvenOdd },
{ 11362, 11362, -10743 },
{ 11363, 11363, -3814 },
{ 11364, 11364, -10727 },
{ 11367, 11371, OddEvenSkip },
{ 11373, 11373, -10780 },
{ 11374, 11374, -10749 },
{ 11375, 11375, -10783 },
{ 11376, 11376, -10782 },
{ 11378, 11378, EvenOdd },
{ 11381, 11381, OddEven },
{ 11390, 11391, -10815 },
{ 11392, 11490, EvenOddSkip },
{ 11499, 11501, OddEvenSkip },
{ 11506, 11506, EvenOdd },
{ 42560, 42604, EvenOddSkip },
{ 42624, 42650, EvenOddSkip },
{ 42786, 42798, EvenOddSkip },
{ 42802, 42862, EvenOddSkip },
{ 42873, 42875, OddEvenSkip },
{ 42877, 42877, -35332 },
{ 42878, 42886, EvenOddSkip },
{ 42891, 42891, OddEven },
{ 42893, 42893, -42280 },
{ 42896, 42898, EvenOddSkip },
{ 42902, 42920, EvenOddSkip },
{ 42922, 42922, -42308 },
{ 42923, 42923, -42319 },
{ 42924, 42924, -42315 },
{ 42925, 42925, -42305 },
{ 42926, 42926, -42308 },
{ 42928, 42928, -42258 },
{ 42929, 42929, -42282 },
{ 42930, 42930, -42261 },
{ 42931, 42931, 928 },
{ 42932, 42934, EvenOddSkip },
{ 43888, 43967, -38864 },
{ 65313, 65338, 32 },
{ 66560, 66599, 40 },
{ 66736, 66771, 40 },
{ 68736, 68786, 64 },
{ 71840, 71871, 32 },
{ 125184, 125217, 34 },
};
const int num_unicode_tolower = 191;
} // namespace re2

View File

@ -1,78 +0,0 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UNICODE_CASEFOLD_H_
#define RE2_UNICODE_CASEFOLD_H_
// Unicode case folding tables.
// The Unicode case folding tables encode the mapping from one Unicode point
// to the next largest Unicode point with equivalent folding. The largest
// point wraps back to the first. For example, the tables map:
//
// 'A' -> 'a'
// 'a' -> 'A'
//
// 'K' -> 'k'
// 'k' -> '' (Kelvin symbol)
// '' -> 'K'
//
// Like everything Unicode, these tables are big. If we represent the table
// as a sorted list of uint32_t pairs, it has 2049 entries and is 16 kB.
// Most table entries look like the ones around them:
// 'A' maps to 'A'+32, 'B' maps to 'B'+32, etc.
// Instead of listing all the pairs explicitly, we make a list of ranges
// and deltas, so that the table entries for 'A' through 'Z' can be represented
// as a single entry { 'A', 'Z', +32 }.
//
// In addition to blocks that map to each other (A-Z mapping to a-z)
// there are blocks of pairs that individually map to each other
// (for example, 0100<->0101, 0102<->0103, 0104<->0105, ...).
// For those, the special delta value EvenOdd marks even/odd pairs
// (if even, add 1; if odd, subtract 1), and OddEven marks odd/even pairs.
//
// In this form, the table has 274 entries, about 3kB. If we were to split
// the table into one for 16-bit codes and an overflow table for larger ones,
// we could get it down to about 1.5kB, but that's not worth the complexity.
//
// The grouped form also allows for efficient fold range calculations
// rather than looping one character at a time.
#include <stdint.h>
#include "util/util.h"
#include "util/utf.h"
namespace re2 {
enum {
EvenOdd = 1,
OddEven = -1,
EvenOddSkip = 1<<30,
OddEvenSkip,
};
struct CaseFold {
Rune lo;
Rune hi;
int32_t delta;
};
extern const CaseFold unicode_casefold[];
extern const int num_unicode_casefold;
extern const CaseFold unicode_tolower[];
extern const int num_unicode_tolower;
// Returns the CaseFold* in the tables that contains rune.
// If rune is not in the tables, returns the first CaseFold* after rune.
// If rune is larger than any value in the tables, returns NULL.
extern const CaseFold* LookupCaseFold(const CaseFold*, int, Rune rune);
// Returns the result of applying the fold f to the rune r.
extern Rune ApplyFold(const CaseFold *f, Rune r);
} // namespace re2
#endif // RE2_UNICODE_CASEFOLD_H_

File diff suppressed because it is too large Load Diff

View File

@ -1,67 +0,0 @@
// Copyright 2008 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_UNICODE_GROUPS_H_
#define RE2_UNICODE_GROUPS_H_
// Unicode character groups.
// The codes get split into ranges of 16-bit codes
// and ranges of 32-bit codes. It would be simpler
// to use only 32-bit ranges, but these tables are large
// enough to warrant extra care.
//
// Using just 32-bit ranges gives 27 kB of data.
// Adding 16-bit ranges gives 18 kB of data.
// Adding an extra table of 16-bit singletons would reduce
// to 16.5 kB of data but make the data harder to use;
// we don't bother.
#include <stdint.h>
#include "util/util.h"
#include "util/utf.h"
namespace re2 {
struct URange16
{
uint16_t lo;
uint16_t hi;
};
struct URange32
{
Rune lo;
Rune hi;
};
struct UGroup
{
const char *name;
int sign; // +1 for [abc], -1 for [^abc]
const URange16 *r16;
int nr16;
const URange32 *r32;
int nr32;
};
// Named by property or script name (e.g., "Nd", "N", "Han").
// Negated groups are not included.
extern const UGroup unicode_groups[];
extern const int num_unicode_groups;
// Named by POSIX name (e.g., "[:alpha:]", "[:^lower:]").
// Negated groups are included.
extern const UGroup posix_groups[];
extern const int num_posix_groups;
// Named by Perl name (e.g., "\\d", "\\D").
// Negated groups are included.
extern const UGroup perl_groups[];
extern const int num_perl_groups;
} // namespace re2
#endif // RE2_UNICODE_GROUPS_H_

View File

@ -1,344 +0,0 @@
// Copyright 2010 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_VARIADIC_FUNCTION_H_
#define RE2_VARIADIC_FUNCTION_H_
namespace re2 {
template <typename Result, typename Param0, typename Param1, typename Arg,
Result (*Func)(Param0, Param1, const Arg* const [], int count)>
class VariadicFunction2 {
public:
Result operator()(Param0 p0, Param1 p1) const {
return Func(p0, p1, 0, 0);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0) const {
const Arg* const args[] = { &a0 };
return Func(p0, p1, args, 1);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1) const {
const Arg* const args[] = { &a0, &a1 };
return Func(p0, p1, args, 2);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2) const {
const Arg* const args[] = { &a0, &a1, &a2 };
return Func(p0, p1, args, 3);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3 };
return Func(p0, p1, args, 4);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4 };
return Func(p0, p1, args, 5);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5 };
return Func(p0, p1, args, 6);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6 };
return Func(p0, p1, args, 7);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7 };
return Func(p0, p1, args, 8);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8 };
return Func(p0, p1, args, 9);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9 };
return Func(p0, p1, args, 10);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10 };
return Func(p0, p1, args, 11);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11 };
return Func(p0, p1, args, 12);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12 };
return Func(p0, p1, args, 13);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13 };
return Func(p0, p1, args, 14);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14 };
return Func(p0, p1, args, 15);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15 };
return Func(p0, p1, args, 16);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16 };
return Func(p0, p1, args, 17);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17 };
return Func(p0, p1, args, 18);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18 };
return Func(p0, p1, args, 19);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19 };
return Func(p0, p1, args, 20);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19,
&a20 };
return Func(p0, p1, args, 21);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21 };
return Func(p0, p1, args, 22);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22 };
return Func(p0, p1, args, 23);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23 };
return Func(p0, p1, args, 24);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24 };
return Func(p0, p1, args, 25);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25 };
return Func(p0, p1, args, 26);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26 };
return Func(p0, p1, args, 27);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27 };
return Func(p0, p1, args, 28);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28 };
return Func(p0, p1, args, 29);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29 };
return Func(p0, p1, args, 30);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30 };
return Func(p0, p1, args, 31);
}
Result operator()(Param0 p0, Param1 p1, const Arg& a0, const Arg& a1,
const Arg& a2, const Arg& a3, const Arg& a4, const Arg& a5,
const Arg& a6, const Arg& a7, const Arg& a8, const Arg& a9,
const Arg& a10, const Arg& a11, const Arg& a12, const Arg& a13,
const Arg& a14, const Arg& a15, const Arg& a16, const Arg& a17,
const Arg& a18, const Arg& a19, const Arg& a20, const Arg& a21,
const Arg& a22, const Arg& a23, const Arg& a24, const Arg& a25,
const Arg& a26, const Arg& a27, const Arg& a28, const Arg& a29,
const Arg& a30, const Arg& a31) const {
const Arg* const args[] = { &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, &a8,
&a9, &a10, &a11, &a12, &a13, &a14, &a15, &a16, &a17, &a18, &a19, &a20,
&a21, &a22, &a23, &a24, &a25, &a26, &a27, &a28, &a29, &a30, &a31 };
return Func(p0, p1, args, 32);
}
};
} // namespace re2
#endif // RE2_VARIADIC_FUNCTION_H_

View File

@ -1,248 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef RE2_WALKER_INL_H_
#define RE2_WALKER_INL_H_
// Helper class for traversing Regexps without recursion.
// Clients should declare their own subclasses that override
// the PreVisit and PostVisit methods, which are called before
// and after visiting the subexpressions.
// Not quite the Visitor pattern, because (among other things)
// the Visitor pattern is recursive.
#include <stack>
#include "util/logging.h"
#include "re2/regexp.h"
namespace re2 {
template<typename T> struct WalkState;
template<typename T> class Regexp::Walker {
public:
Walker();
virtual ~Walker();
// Virtual method called before visiting re's children.
// PreVisit passes ownership of its return value to its caller.
// The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
// and passed to the child PreVisits and PostVisits as parent_arg.
// At the top-most Regexp, parent_arg is arg passed to walk.
// If PreVisit sets *stop to true, the walk does not recurse
// into the children. Instead it behaves as though the return
// value from PreVisit is the return value from PostVisit.
// The default PreVisit returns parent_arg.
virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
// Virtual method called after visiting re's children.
// The pre_arg is the T that PreVisit returned.
// The child_args is a vector of the T that the child PostVisits returned.
// PostVisit takes ownership of pre_arg.
// PostVisit takes ownership of the Ts
// in *child_args, but not the vector itself.
// PostVisit passes ownership of its return value
// to its caller.
// The default PostVisit simply returns pre_arg.
virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
T* child_args, int nchild_args);
// Virtual method called to copy a T,
// when Walk notices that more than one child is the same re.
virtual T Copy(T arg);
// Virtual method called to do a "quick visit" of the re,
// but not its children. Only called once the visit budget
// has been used up and we're trying to abort the walk
// as quickly as possible. Should return a value that
// makes sense for the parent PostVisits still to be run.
// This function is (hopefully) only called by
// WalkExponential, but must be implemented by all clients,
// just in case.
virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
// Walks over a regular expression.
// Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
// Returns the T returned by PostVisit on re.
T Walk(Regexp* re, T top_arg);
// Like Walk, but doesn't use Copy. This can lead to
// exponential runtimes on cross-linked Regexps like the
// ones generated by Simplify. To help limit this,
// at most max_visits nodes will be visited and then
// the walk will be cut off early.
// If the walk *is* cut off early, ShortVisit(re)
// will be called on regexps that cannot be fully
// visited rather than calling PreVisit/PostVisit.
T WalkExponential(Regexp* re, T top_arg, int max_visits);
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
void Reset();
// Returns whether walk was cut off.
bool stopped_early() { return stopped_early_; }
private:
// Walk state for the entire traversal.
std::stack<WalkState<T> >* stack_;
bool stopped_early_;
int max_visits_;
T WalkInternal(Regexp* re, T top_arg, bool use_copy);
Walker(const Walker&) = delete;
Walker& operator=(const Walker&) = delete;
};
template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
T parent_arg,
bool* stop) {
return parent_arg;
}
template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
T parent_arg,
T pre_arg,
T* child_args,
int nchild_args) {
return pre_arg;
}
template<typename T> T Regexp::Walker<T>::Copy(T arg) {
return arg;
}
// State about a single level in the traversal.
template<typename T> struct WalkState {
WalkState<T>(Regexp* re, T parent)
: re(re),
n(-1),
parent_arg(parent),
child_args(NULL) { }
Regexp* re; // The regexp
int n; // The index of the next child to process; -1 means need to PreVisit
T parent_arg; // Accumulated arguments.
T pre_arg;
T child_arg; // One-element buffer for child_args.
T* child_args;
};
template<typename T> Regexp::Walker<T>::Walker() {
stack_ = new std::stack<WalkState<T> >;
stopped_early_ = false;
}
template<typename T> Regexp::Walker<T>::~Walker() {
Reset();
delete stack_;
}
// Clears the stack. Should never be necessary, since
// Walk always enters and exits with an empty stack.
// Logs DFATAL if stack is not already clear.
template<typename T> void Regexp::Walker<T>::Reset() {
if (stack_ && stack_->size() > 0) {
LOG(DFATAL) << "Stack not empty.";
while (stack_->size() > 0) {
delete stack_->top().child_args;
stack_->pop();
}
}
}
template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
bool use_copy) {
Reset();
if (re == NULL) {
LOG(DFATAL) << "Walk NULL";
return top_arg;
}
stack_->push(WalkState<T>(re, top_arg));
WalkState<T>* s;
for (;;) {
T t;
s = &stack_->top();
Regexp* re = s->re;
switch (s->n) {
case -1: {
if (--max_visits_ < 0) {
stopped_early_ = true;
t = ShortVisit(re, s->parent_arg);
break;
}
bool stop = false;
s->pre_arg = PreVisit(re, s->parent_arg, &stop);
if (stop) {
t = s->pre_arg;
break;
}
s->n = 0;
s->child_args = NULL;
if (re->nsub_ == 1)
s->child_args = &s->child_arg;
else if (re->nsub_ > 1)
s->child_args = new T[re->nsub_];
FALLTHROUGH_INTENDED;
}
default: {
if (re->nsub_ > 0) {
Regexp** sub = re->sub();
if (s->n < re->nsub_) {
if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
s->n++;
} else {
stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
}
continue;
}
}
t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
if (re->nsub_ > 1)
delete[] s->child_args;
break;
}
}
// We've finished stack_->top().
// Update next guy down.
stack_->pop();
if (stack_->size() == 0)
return t;
s = &stack_->top();
if (s->child_args != NULL)
s->child_args[s->n] = t;
else
s->child_arg = t;
s->n++;
}
}
template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
// Without the exponential walking behavior,
// this budget should be more than enough for any
// regexp, and yet not enough to get us in trouble
// as far as CPU time.
max_visits_ = 1000000;
return WalkInternal(re, top_arg, true);
}
template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
int max_visits) {
max_visits_ = max_visits;
return WalkInternal(re, top_arg, false);
}
} // namespace re2
#endif // RE2_WALKER_INL_H_

View File

@ -1,160 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <chrono>
#include "util/util.h"
#include "util/flags.h"
#include "util/benchmark.h"
#include "re2/re2.h"
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
#ifdef _WIN32
#define snprintf _snprintf
#endif
using testing::Benchmark;
static Benchmark* benchmarks[10000];
static int nbenchmarks;
void Benchmark::Register() {
benchmarks[nbenchmarks] = this;
if(lo < 1)
lo = 1;
if(hi < lo)
hi = lo;
nbenchmarks++;
}
static int64_t nsec() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(
std::chrono::steady_clock::now().time_since_epoch()).count();
}
static int64_t bytes;
static int64_t ns;
static int64_t t0;
static int64_t items;
void SetBenchmarkBytesProcessed(int64_t x) {
bytes = x;
}
void StopBenchmarkTiming() {
if(t0 != 0)
ns += nsec() - t0;
t0 = 0;
}
void StartBenchmarkTiming() {
if(t0 == 0)
t0 = nsec();
}
void SetBenchmarkItemsProcessed(int n) {
items = n;
}
void BenchmarkMemoryUsage() {
// TODO(rsc): Implement.
}
int NumCPUs() {
return 1;
}
static void runN(Benchmark *b, int n, int siz) {
bytes = 0;
items = 0;
ns = 0;
t0 = nsec();
if(b->fn)
b->fn(n);
else if(b->fnr)
b->fnr(n, siz);
else {
fprintf(stderr, "%s: missing function\n", b->name);
abort();
}
if(t0 != 0)
ns += nsec() - t0;
}
static int round(int n) {
int base = 1;
while(base*10 < n)
base *= 10;
if(n < 2*base)
return 2*base;
if(n < 5*base)
return 5*base;
return 10*base;
}
void RunBench(Benchmark* b, int nthread, int siz) {
int n, last;
// TODO(rsc): Threaded benchmarks.
if(nthread != 1)
return;
// run once in case it's expensive
n = 1;
runN(b, n, siz);
while(ns < (int)1e9 && n < (int)1e9) {
last = n;
if(ns/n == 0)
n = (int)1e9;
else
n = (int)1e9 / static_cast<int>(ns/n);
n = std::max(last+1, std::min(n+n/2, 100*last));
n = round(n);
runN(b, n, siz);
}
char mb[100];
char suf[100];
mb[0] = '\0';
suf[0] = '\0';
if(ns > 0 && bytes > 0)
snprintf(mb, sizeof mb, "\t%7.2f MB/s", ((double)bytes/1e6)/((double)ns/1e9));
if(b->fnr || b->lo != b->hi) {
if(siz >= (1<<20))
snprintf(suf, sizeof suf, "/%dM", siz/(1<<20));
else if(siz >= (1<<10))
snprintf(suf, sizeof suf, "/%dK", siz/(1<<10));
else
snprintf(suf, sizeof suf, "/%d", siz);
}
printf("%s%s\t%8lld\t%10lld ns/op%s\n", b->name, suf, (long long)n, (long long)ns/n, mb);
fflush(stdout);
}
static int match(const char* name, int argc, const char** argv) {
if(argc == 1)
return 1;
for(int i = 1; i < argc; i++)
if(RE2::PartialMatch(name, argv[i]))
return 1;
return 0;
}
int main(int argc, const char** argv) {
for(int i = 0; i < nbenchmarks; i++) {
Benchmark* b = benchmarks[i];
if(match(b->name, argc, argv))
for(int j = b->threadlo; j <= b->threadhi; j++)
for(int k = std::max(b->lo, 1); k <= std::max(b->hi, 1); k<<=1)
RunBench(b, j, k);
}
}

View File

@ -1,43 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_BENCHMARK_H_
#define UTIL_BENCHMARK_H_
#include <stdint.h>
namespace testing {
struct Benchmark {
const char* name;
void (*fn)(int);
void (*fnr)(int, int);
int lo;
int hi;
int threadlo;
int threadhi;
void Register();
Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }
Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }
void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; }
Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; }
};
} // namespace testing
void SetBenchmarkBytesProcessed(int64_t);
void StopBenchmarkTiming();
void StartBenchmarkTiming();
void BenchmarkMemoryUsage();
void SetBenchmarkItemsProcessed(int);
int NumCPUs();
#define BENCHMARK(f) \
::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))
#define BENCHMARK_RANGE(f, lo, hi) \
::testing::Benchmark* _benchmark_##f = \
(new ::testing::Benchmark(#f, f, lo, hi))
#endif // UTIL_BENCHMARK_H_

View File

@ -1,29 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_FLAGS_H_
#define UTIL_FLAGS_H_
// Simplified version of Google's command line flags.
// Does not support parsing the command line.
// If you want to do that, see
// https://gflags.github.io/gflags/
#include <stdint.h>
#define DEFINE_flag(type, name, deflt, desc) \
namespace re2 { type FLAGS_##name = deflt; }
#define DECLARE_flag(type, name) \
namespace re2 { extern type FLAGS_##name; }
#define DEFINE_bool(name, deflt, desc) DEFINE_flag(bool, name, deflt, desc)
#define DEFINE_int32(name, deflt, desc) DEFINE_flag(int32_t, name, deflt, desc)
#define DEFINE_string(name, deflt, desc) DEFINE_flag(string, name, deflt, desc)
#define DECLARE_bool(name) DECLARE_flag(bool, name)
#define DECLARE_int32(name) DECLARE_flag(int32_t, name)
#define DECLARE_string(name) DECLARE_flag(string, name)
#endif // UTIL_FLAGS_H_

View File

@ -1,21 +0,0 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
// Entry point for libFuzzer.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
int main(int argc, char** argv) {
uint8_t data[32];
for (int i = 0; i < 32; i++) {
for (int j = 0; j < 32; j++) {
data[j] = random() & 0xFF;
}
LLVMFuzzerTestOneInput(data, 32);
}
return 0;
}

View File

@ -1,109 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_LOGGING_H_
#define UTIL_LOGGING_H_
// Simplified version of Google's logging.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <ostream>
#include <sstream>
#include "util/util.h"
// Debug-only checking.
#define DCHECK(condition) assert(condition)
#define DCHECK_EQ(val1, val2) assert((val1) == (val2))
#define DCHECK_NE(val1, val2) assert((val1) != (val2))
#define DCHECK_LE(val1, val2) assert((val1) <= (val2))
#define DCHECK_LT(val1, val2) assert((val1) < (val2))
#define DCHECK_GE(val1, val2) assert((val1) >= (val2))
#define DCHECK_GT(val1, val2) assert((val1) > (val2))
// Always-on checking
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
#define CHECK_LT(x, y) CHECK((x) < (y))
#define CHECK_GT(x, y) CHECK((x) > (y))
#define CHECK_LE(x, y) CHECK((x) <= (y))
#define CHECK_GE(x, y) CHECK((x) >= (y))
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))
#define LOG_INFO LogMessage(__FILE__, __LINE__)
#define LOG_WARNING LogMessage(__FILE__, __LINE__)
#define LOG_ERROR LogMessage(__FILE__, __LINE__)
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
#define LOG_QFATAL LOG_FATAL
// It seems that one of the Windows header files defines ERROR as 0.
#ifdef _WIN32
#define LOG_0 LOG_INFO
#endif
#ifdef NDEBUG
#define LOG_DFATAL LOG_ERROR
#else
#define LOG_DFATAL LOG_FATAL
#endif
#define LOG(severity) LOG_ ## severity.stream()
#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
class LogMessage {
public:
LogMessage(const char* file, int line)
: flushed_(false) {
stream() << file << ":" << line << ": ";
}
void Flush() {
stream() << "\n";
string s = str_.str();
size_t n = s.size();
if (fwrite(s.data(), 1, n, stderr) < n) {} // shut up gcc
flushed_ = true;
}
~LogMessage() {
if (!flushed_) {
Flush();
}
}
std::ostream& stream() { return str_; }
private:
bool flushed_;
std::ostringstream str_;
LogMessage(const LogMessage&) = delete;
LogMessage& operator=(const LogMessage&) = delete;
};
// Silence "destructor never returns" warning for ~LogMessageFatal().
// Since this is a header file, push and then pop to limit the scope.
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4722)
#endif
class LogMessageFatal : public LogMessage {
public:
LogMessageFatal(const char* file, int line)
: LogMessage(file, line) {}
~LogMessageFatal() {
Flush();
abort();
}
private:
LogMessageFatal(const LogMessageFatal&) = delete;
LogMessageFatal& operator=(const LogMessageFatal&) = delete;
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
#endif // UTIL_LOGGING_H_

View File

@ -1,41 +0,0 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MIX_H_
#define UTIL_MIX_H_
#include <stddef.h>
#include <limits>
namespace re2 {
// Silence "truncation of constant value" warning for kMul in 32-bit mode.
// Since this is a header file, push and then pop to limit the scope.
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4309)
#endif
class HashMix {
public:
HashMix() : hash_(1) {}
explicit HashMix(size_t val) : hash_(val + 83) {}
void Mix(size_t val) {
static const size_t kMul = static_cast<size_t>(0xdc3eb94af8ab4c93ULL);
hash_ *= kMul;
hash_ = ((hash_ << 19) |
(hash_ >> (std::numeric_limits<size_t>::digits - 19))) + val;
}
size_t get() const { return hash_; }
private:
size_t hash_;
};
#ifdef _MSC_VER
#pragma warning(pop)
#endif
} // namespace re2
#endif // UTIL_MIX_H_

View File

@ -1,131 +0,0 @@
// Copyright 2007 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_MUTEX_H_
#define UTIL_MUTEX_H_
/*
* A simple mutex wrapper, supporting locks and read-write locks.
* You should assume the locks are *not* re-entrant.
*/
#if !defined(_WIN32)
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif
#include <unistd.h>
#if defined(_POSIX_READER_WRITER_LOCKS) && _POSIX_READER_WRITER_LOCKS > 0
#define MUTEX_IS_PTHREAD_RWLOCK
#endif
#endif
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
#include <pthread.h>
#include <stdlib.h>
typedef pthread_rwlock_t MutexType;
#else
#include <mutex>
typedef std::mutex MutexType;
#endif
namespace re2 {
class Mutex {
public:
inline Mutex();
inline ~Mutex();
inline void Lock(); // Block if needed until free then acquire exclusively
inline void Unlock(); // Release a lock acquired via Lock()
// Note that on systems that don't support read-write locks, these may
// be implemented as synonyms to Lock() and Unlock(). So you can use
// these for efficiency, but don't use them anyplace where being able
// to do shared reads is necessary to avoid deadlock.
inline void ReaderLock(); // Block until free or shared then acquire a share
inline void ReaderUnlock(); // Release a read share of this Mutex
inline void WriterLock() { Lock(); } // Acquire an exclusive lock
inline void WriterUnlock() { Unlock(); } // Release a lock from WriterLock()
private:
MutexType mutex_;
// Catch the error of writing Mutex when intending MutexLock.
Mutex(Mutex *ignored);
Mutex(const Mutex&) = delete;
Mutex& operator=(const Mutex&) = delete;
};
#if defined(MUTEX_IS_PTHREAD_RWLOCK)
#define SAFE_PTHREAD(fncall) \
do { \
if ((fncall) != 0) abort(); \
} while (0)
Mutex::Mutex() { SAFE_PTHREAD(pthread_rwlock_init(&mutex_, NULL)); }
Mutex::~Mutex() { SAFE_PTHREAD(pthread_rwlock_destroy(&mutex_)); }
void Mutex::Lock() { SAFE_PTHREAD(pthread_rwlock_wrlock(&mutex_)); }
void Mutex::Unlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
void Mutex::ReaderLock() { SAFE_PTHREAD(pthread_rwlock_rdlock(&mutex_)); }
void Mutex::ReaderUnlock() { SAFE_PTHREAD(pthread_rwlock_unlock(&mutex_)); }
#undef SAFE_PTHREAD
#else
Mutex::Mutex() { }
Mutex::~Mutex() { }
void Mutex::Lock() { mutex_.lock(); }
void Mutex::Unlock() { mutex_.unlock(); }
void Mutex::ReaderLock() { Lock(); } // C++11 doesn't have std::shared_mutex.
void Mutex::ReaderUnlock() { Unlock(); }
#endif
// --------------------------------------------------------------------------
// Some helper classes
// MutexLock(mu) acquires mu when constructed and releases it when destroyed.
class MutexLock {
public:
explicit MutexLock(Mutex *mu) : mu_(mu) { mu_->Lock(); }
~MutexLock() { mu_->Unlock(); }
private:
Mutex * const mu_;
MutexLock(const MutexLock&) = delete;
MutexLock& operator=(const MutexLock&) = delete;
};
// ReaderMutexLock and WriterMutexLock do the same, for rwlocks
class ReaderMutexLock {
public:
explicit ReaderMutexLock(Mutex *mu) : mu_(mu) { mu_->ReaderLock(); }
~ReaderMutexLock() { mu_->ReaderUnlock(); }
private:
Mutex * const mu_;
ReaderMutexLock(const ReaderMutexLock&) = delete;
ReaderMutexLock& operator=(const ReaderMutexLock&) = delete;
};
class WriterMutexLock {
public:
explicit WriterMutexLock(Mutex *mu) : mu_(mu) { mu_->WriterLock(); }
~WriterMutexLock() { mu_->WriterUnlock(); }
private:
Mutex * const mu_;
WriterMutexLock(const WriterMutexLock&) = delete;
WriterMutexLock& operator=(const WriterMutexLock&) = delete;
};
// Catch bug where variable name is omitted, e.g. MutexLock (&mu);
#define MutexLock(x) static_assert(false, "MutexLock declaration missing variable name")
#define ReaderMutexLock(x) static_assert(false, "ReaderMutexLock declaration missing variable name")
#define WriterMutexLock(x) static_assert(false, "WriterMutexLock declaration missing variable name")
} // namespace re2
#endif // UTIL_MUTEX_H_

File diff suppressed because it is too large Load Diff

View File

@ -1,680 +0,0 @@
// Copyright 2003-2010 Google Inc. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_PCRE_H_
#define UTIL_PCRE_H_
// This is a variant of PCRE's pcrecpp.h, originally written at Google.
// The main changes are the addition of the HitLimit method and
// compilation as PCRE in namespace re2.
// C++ interface to the pcre regular-expression library. PCRE supports
// Perl-style regular expressions (with extensions like \d, \w, \s,
// ...).
//
// -----------------------------------------------------------------------
// REGEXP SYNTAX:
//
// This module uses the pcre library and hence supports its syntax
// for regular expressions:
//
// http://www.google.com/search?q=pcre
//
// The syntax is pretty similar to Perl's. For those not familiar
// with Perl's regular expressions, here are some examples of the most
// commonly used extensions:
//
// "hello (\\w+) world" -- \w matches a "word" character
// "version (\\d+)" -- \d matches a digit
// "hello\\s+world" -- \s matches any whitespace character
// "\\b(\\w+)\\b" -- \b matches empty string at a word boundary
// "(?i)hello" -- (?i) turns on case-insensitive matching
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
//
// -----------------------------------------------------------------------
// MATCHING INTERFACE:
//
// The "FullMatch" operation checks that supplied text matches a
// supplied pattern exactly.
//
// Example: successful match
// CHECK(PCRE::FullMatch("hello", "h.*o"));
//
// Example: unsuccessful match (requires full match):
// CHECK(!PCRE::FullMatch("hello", "e"));
//
// -----------------------------------------------------------------------
// UTF-8 AND THE MATCHING INTERFACE:
//
// By default, pattern and text are plain text, one byte per character.
// The UTF8 flag, passed to the constructor, causes both pattern
// and string to be treated as UTF-8 text, still a byte stream but
// potentially multiple bytes per character. In practice, the text
// is likelier to be UTF-8 than the pattern, but the match returned
// may depend on the UTF8 flag, so always use it when matching
// UTF8 text. E.g., "." will match one byte normally but with UTF8
// set may match up to three bytes of a multi-byte character.
//
// Example:
// PCRE re(utf8_pattern, PCRE::UTF8);
// CHECK(PCRE::FullMatch(utf8_string, re));
//
// -----------------------------------------------------------------------
// MATCHING WITH SUB-STRING EXTRACTION:
//
// You can supply extra pointer arguments to extract matched subpieces.
//
// Example: extracts "ruby" into "s" and 1234 into "i"
// int i;
// string s;
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
//
// Example: fails because string cannot be stored in integer
// CHECK(!PCRE::FullMatch("ruby", "(.*)", &i));
//
// Example: fails because there aren't enough sub-patterns:
// CHECK(!PCRE::FullMatch("ruby:1234", "\\w+:\\d+", &s));
//
// Example: does not try to extract any extra sub-patterns
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
//
// Example: does not try to extract into NULL
// CHECK(PCRE::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
//
// Example: integer overflow causes failure
// CHECK(!PCRE::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
//
// -----------------------------------------------------------------------
// PARTIAL MATCHES
//
// You can use the "PartialMatch" operation when you want the pattern
// to match any substring of the text.
//
// Example: simple search for a string:
// CHECK(PCRE::PartialMatch("hello", "ell"));
//
// Example: find first number in a string
// int number;
// CHECK(PCRE::PartialMatch("x*100 + 20", "(\\d+)", &number));
// CHECK_EQ(number, 100);
//
// -----------------------------------------------------------------------
// PPCRE-COMPILED PCREGULAR EXPPCRESSIONS
//
// PCRE makes it easy to use any string as a regular expression, without
// requiring a separate compilation step.
//
// If speed is of the essence, you can create a pre-compiled "PCRE"
// object from the pattern and use it multiple times. If you do so,
// you can typically parse text faster than with sscanf.
//
// Example: precompile pattern for faster matching:
// PCRE pattern("h.*o");
// while (ReadLine(&str)) {
// if (PCRE::FullMatch(str, pattern)) ...;
// }
//
// -----------------------------------------------------------------------
// SCANNING TEXT INCPCREMENTALLY
//
// The "Consume" operation may be useful if you want to repeatedly
// match regular expressions at the front of a string and skip over
// them as they match. This requires use of the "StringPiece" type,
// which represents a sub-range of a real string.
//
// Example: read lines of the form "var = value" from a string.
// string contents = ...; // Fill string somehow
// StringPiece input(contents); // Wrap a StringPiece around it
//
// string var;
// int value;
// while (PCRE::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
// ...;
// }
//
// Each successful call to "Consume" will set "var/value", and also
// advance "input" so it points past the matched text. Note that if the
// regular expression matches an empty string, input will advance
// by 0 bytes. If the regular expression being used might match
// an empty string, the loop body must check for this case and either
// advance the string or break out of the loop.
//
// The "FindAndConsume" operation is similar to "Consume" but does not
// anchor your match at the beginning of the string. For example, you
// could extract all words from a string by repeatedly calling
// PCRE::FindAndConsume(&input, "(\\w+)", &word)
//
// -----------------------------------------------------------------------
// PARSING HEX/OCTAL/C-RADIX NUMBERS
//
// By default, if you pass a pointer to a numeric value, the
// corresponding text is interpreted as a base-10 number. You can
// instead wrap the pointer with a call to one of the operators Hex(),
// Octal(), or CRadix() to interpret the text in another base. The
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
// prefixes, but defaults to base-10.
//
// Example:
// int a, b, c, d;
// CHECK(PCRE::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
// Octal(&a), Hex(&b), CRadix(&c), CRadix(&d));
// will leave 64 in a, b, c, and d.
#include "util/util.h"
#include "re2/stringpiece.h"
#ifdef USEPCRE
#include <pcre.h>
namespace re2 {
const bool UsingPCRE = true;
} // namespace re2
#else
struct pcre; // opaque
namespace re2 {
const bool UsingPCRE = false;
} // namespace re2
#endif
namespace re2 {
class PCRE_Options;
// Interface for regular expression matching. Also corresponds to a
// pre-compiled regular expression. An "PCRE" object is safe for
// concurrent use by multiple threads.
class PCRE {
public:
// We convert user-passed pointers into special Arg objects
class Arg;
// Marks end of arg list.
// ONLY USE IN OPTIONAL ARG DEFAULTS.
// DO NOT PASS EXPLICITLY.
static Arg no_more_args;
// Options are same value as those in pcre. We provide them here
// to avoid users needing to include pcre.h and also to isolate
// users from pcre should we change the underlying library.
// Only those needed by Google programs are exposed here to
// avoid collision with options employed internally by regexp.cc
// Note that some options have equivalents that can be specified in
// the regexp itself. For example, prefixing your regexp with
// "(?s)" has the same effect as the PCRE_DOTALL option.
enum Option {
None = 0x0000,
UTF8 = 0x0800, // == PCRE_UTF8
EnabledCompileOptions = UTF8,
EnabledExecOptions = 0x0000, // TODO: use to replace anchor flag
};
// We provide implicit conversions from strings so that users can
// pass in a string or a "const char*" wherever an "PCRE" is expected.
PCRE(const char* pattern);
PCRE(const char* pattern, Option option);
PCRE(const string& pattern);
PCRE(const string& pattern, Option option);
PCRE(const char *pattern, const PCRE_Options& re_option);
PCRE(const string& pattern, const PCRE_Options& re_option);
~PCRE();
// The string specification for this PCRE. E.g.
// PCRE re("ab*c?d+");
// re.pattern(); // "ab*c?d+"
const string& pattern() const { return pattern_; }
// If PCRE could not be created properly, returns an error string.
// Else returns the empty string.
const string& error() const { return *error_; }
// Whether the PCRE has hit a match limit during execution.
// Not thread safe. Intended only for testing.
// If hitting match limits is a problem,
// you should be using PCRE2 (re2/re2.h)
// instead of checking this flag.
bool HitLimit();
void ClearHitLimit();
/***** The useful part: the matching interface *****/
// Matches "text" against "pattern". If pointer arguments are
// supplied, copies matched sub-patterns into them.
//
// You can pass in a "const char*" or a "string" for "text".
// You can pass in a "const char*" or a "string" or a "PCRE" for "pattern".
//
// The provided pointer arguments can be pointers to any scalar numeric
// type, or one of:
// string (matched piece is copied to string)
// StringPiece (StringPiece is mutated to point to matched piece)
// T (where "bool T::ParseFrom(const char*, size_t)" exists)
// (void*)NULL (the corresponding matched sub-pattern is not copied)
//
// Returns true iff all of the following conditions are satisfied:
// a. "text" matches "pattern" exactly
// b. The number of matched sub-patterns is >= number of supplied pointers
// c. The "i"th argument has a suitable type for holding the
// string captured as the "i"th sub-pattern. If you pass in
// NULL for the "i"th argument, or pass fewer arguments than
// number of sub-patterns, "i"th captured sub-pattern is
// ignored.
//
// CAVEAT: An optional sub-pattern that does not exist in the
// matched string is assigned the empty string. Therefore, the
// following will return false (because the empty string is not a
// valid number):
// int number;
// PCRE::FullMatch("abc", "[a-z]+(\\d+)?", &number);
struct FullMatchFunctor {
bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const FullMatchFunctor FullMatch;
// Exactly like FullMatch(), except that "pattern" is allowed to match
// a substring of "text".
struct PartialMatchFunctor {
bool operator ()(const StringPiece& text, const PCRE& re, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const PartialMatchFunctor PartialMatch;
// Like FullMatch() and PartialMatch(), except that pattern has to
// match a prefix of "text", and "input" is advanced past the matched
// text. Note: "input" is modified iff this routine returns true.
struct ConsumeFunctor {
bool operator ()(StringPiece* input, const PCRE& pattern, // 3..16 args
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const ConsumeFunctor Consume;
// Like Consume(..), but does not anchor the match at the beginning of the
// string. That is, "pattern" need not start its match at the beginning of
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
// word in "s" and stores it in "word".
struct FindAndConsumeFunctor {
bool operator ()(StringPiece* input, const PCRE& pattern,
const Arg& ptr1 = no_more_args,
const Arg& ptr2 = no_more_args,
const Arg& ptr3 = no_more_args,
const Arg& ptr4 = no_more_args,
const Arg& ptr5 = no_more_args,
const Arg& ptr6 = no_more_args,
const Arg& ptr7 = no_more_args,
const Arg& ptr8 = no_more_args,
const Arg& ptr9 = no_more_args,
const Arg& ptr10 = no_more_args,
const Arg& ptr11 = no_more_args,
const Arg& ptr12 = no_more_args,
const Arg& ptr13 = no_more_args,
const Arg& ptr14 = no_more_args,
const Arg& ptr15 = no_more_args,
const Arg& ptr16 = no_more_args) const;
};
static const FindAndConsumeFunctor FindAndConsume;
// Replace the first match of "pattern" in "str" with "rewrite".
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
// used to insert text matching corresponding parenthesized group
// from the pattern. \0 in "rewrite" refers to the entire matching
// text. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(PCRE::Replace(&s, "b+", "d"));
//
// will leave "s" containing "yada dabba doo"
//
// Returns true if the pattern matches and a replacement occurs,
// false otherwise.
static bool Replace(string *str,
const PCRE& pattern,
const StringPiece& rewrite);
// Like Replace(), except replaces all occurrences of the pattern in
// the string with the rewrite. Replacements are not subject to
// re-matching. E.g.,
//
// string s = "yabba dabba doo";
// CHECK(PCRE::GlobalReplace(&s, "b+", "d"));
//
// will leave "s" containing "yada dada doo"
//
// Returns the number of replacements made.
static int GlobalReplace(string *str,
const PCRE& pattern,
const StringPiece& rewrite);
// Like Replace, except that if the pattern matches, "rewrite"
// is copied into "out" with substitutions. The non-matching
// portions of "text" are ignored.
//
// Returns true iff a match occurred and the extraction happened
// successfully; if no match occurs, the string is left unaffected.
static bool Extract(const StringPiece &text,
const PCRE& pattern,
const StringPiece &rewrite,
string *out);
// Check that the given @p rewrite string is suitable for use with
// this PCRE. It checks that:
// * The PCRE has enough parenthesized subexpressions to satisfy all
// of the \N tokens in @p rewrite, and
// * The @p rewrite string doesn't have any syntax errors
// ('\' followed by anything besides [0-9] and '\').
// Making this test will guarantee that "replace" and "extract"
// operations won't LOG(ERROR) or fail because of a bad rewrite
// string.
// @param rewrite The proposed rewrite string.
// @param error An error message is recorded here, iff we return false.
// Otherwise, it is unchanged.
// @return true, iff @p rewrite is suitable for use with the PCRE.
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
// Returns a copy of 'unquoted' with all potentially meaningful
// regexp characters backslash-escaped. The returned string, used
// as a regular expression, will exactly match the original string.
// For example,
// 1.5-2.0?
// becomes:
// 1\.5\-2\.0\?
static string QuoteMeta(const StringPiece& unquoted);
/***** Generic matching interface (not so nice to use) *****/
// Type of match (TODO: Should be restructured as an Option)
enum Anchor {
UNANCHORED, // No anchoring
ANCHOR_START, // Anchor at start only
ANCHOR_BOTH, // Anchor at start and end
};
// General matching routine. Stores the length of the match in
// "*consumed" if successful.
bool DoMatch(const StringPiece& text,
Anchor anchor,
size_t* consumed,
const Arg* const* args, int n) const;
// Return the number of capturing subpatterns, or -1 if the
// regexp wasn't valid on construction.
int NumberOfCapturingGroups() const;
private:
void Init(const char* pattern, Option option, int match_limit,
int stack_limit, bool report_errors);
// Match against "text", filling in "vec" (up to "vecsize" * 2/3) with
// pairs of integers for the beginning and end positions of matched
// text. The first pair corresponds to the entire matched text;
// subsequent pairs correspond, in order, to parentheses-captured
// matches. Returns the number of pairs (one more than the number of
// the last subpattern with a match) if matching was successful
// and zero if the match failed.
// I.e. for PCRE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching
// against "foo", "bar", and "baz" respectively.
// When matching PCRE("(foo)|hello") against "hello", it will return 1.
// But the values for all subpattern are filled in into "vec".
int TryMatch(const StringPiece& text,
size_t startpos,
Anchor anchor,
bool empty_ok,
int *vec,
int vecsize) const;
// Append the "rewrite" string, with backslash subsitutions from "text"
// and "vec", to string "out".
bool Rewrite(string *out,
const StringPiece &rewrite,
const StringPiece &text,
int *vec,
int veclen) const;
// internal implementation for DoMatch
bool DoMatchImpl(const StringPiece& text,
Anchor anchor,
size_t* consumed,
const Arg* const args[],
int n,
int* vec,
int vecsize) const;
// Compile the regexp for the specified anchoring mode
pcre* Compile(Anchor anchor);
string pattern_;
Option options_;
pcre* re_full_; // For full matches
pcre* re_partial_; // For partial matches
const string* error_; // Error indicator (or empty string)
bool report_errors_; // Silences error logging if false
int match_limit_; // Limit on execution resources
int stack_limit_; // Limit on stack resources (bytes)
mutable int32_t hit_limit_; // Hit limit during execution (bool)?
PCRE(const PCRE&) = delete;
PCRE& operator=(const PCRE&) = delete;
};
// PCRE_Options allow you to set the PCRE::Options, plus any pcre
// "extra" options. The only extras are match_limit, which limits
// the CPU time of a match, and stack_limit, which limits the
// stack usage. Setting a limit to <= 0 lets PCRE pick a sensible default
// that should not cause too many problems in production code.
// If PCRE hits a limit during a match, it may return a false negative,
// but (hopefully) it won't crash.
//
// NOTE: If you are handling regular expressions specified by
// (external or internal) users, rather than hard-coded ones,
// you should be using PCRE2, which uses an alternate implementation
// that avoids these issues. See http://go/re2quick.
class PCRE_Options {
public:
// constructor
PCRE_Options() : option_(PCRE::None), match_limit_(0), stack_limit_(0), report_errors_(true) {}
// accessors
PCRE::Option option() const { return option_; }
void set_option(PCRE::Option option) {
option_ = option;
}
int match_limit() const { return match_limit_; }
void set_match_limit(int match_limit) {
match_limit_ = match_limit;
}
int stack_limit() const { return stack_limit_; }
void set_stack_limit(int stack_limit) {
stack_limit_ = stack_limit;
}
// If the regular expression is malformed, an error message will be printed
// iff report_errors() is true. Default: true.
bool report_errors() const { return report_errors_; }
void set_report_errors(bool report_errors) {
report_errors_ = report_errors;
}
private:
PCRE::Option option_;
int match_limit_;
int stack_limit_;
bool report_errors_;
};
/***** Implementation details *****/
// Hex/Octal/Binary?
// Special class for parsing into objects that define a ParseFrom() method
template <class T>
class _PCRE_MatchObject {
public:
static inline bool Parse(const char* str, size_t n, void* dest) {
if (dest == NULL) return true;
T* object = reinterpret_cast<T*>(dest);
return object->ParseFrom(str, n);
}
};
class PCRE::Arg {
public:
// Empty constructor so we can declare arrays of PCRE::Arg
Arg();
// Constructor specially designed for NULL arguments
Arg(void*);
typedef bool (*Parser)(const char* str, size_t n, void* dest);
// Type-specific parsers
#define MAKE_PARSER(type, name) \
Arg(type* p) : arg_(p), parser_(name) {} \
Arg(type* p, Parser parser) : arg_(p), parser_(parser) {}
MAKE_PARSER(char, parse_char);
MAKE_PARSER(signed char, parse_schar);
MAKE_PARSER(unsigned char, parse_uchar);
MAKE_PARSER(float, parse_float);
MAKE_PARSER(double, parse_double);
MAKE_PARSER(string, parse_string);
MAKE_PARSER(StringPiece, parse_stringpiece);
MAKE_PARSER(short, parse_short);
MAKE_PARSER(unsigned short, parse_ushort);
MAKE_PARSER(int, parse_int);
MAKE_PARSER(unsigned int, parse_uint);
MAKE_PARSER(long, parse_long);
MAKE_PARSER(unsigned long, parse_ulong);
MAKE_PARSER(long long, parse_longlong);
MAKE_PARSER(unsigned long long, parse_ulonglong);
#undef MAKE_PARSER
// Generic constructor
template <class T> Arg(T*, Parser parser);
// Generic constructor template
template <class T> Arg(T* p)
: arg_(p), parser_(_PCRE_MatchObject<T>::Parse) {
}
// Parse the data
bool Parse(const char* str, size_t n) const;
private:
void* arg_;
Parser parser_;
static bool parse_null (const char* str, size_t n, void* dest);
static bool parse_char (const char* str, size_t n, void* dest);
static bool parse_schar (const char* str, size_t n, void* dest);
static bool parse_uchar (const char* str, size_t n, void* dest);
static bool parse_float (const char* str, size_t n, void* dest);
static bool parse_double (const char* str, size_t n, void* dest);
static bool parse_string (const char* str, size_t n, void* dest);
static bool parse_stringpiece (const char* str, size_t n, void* dest);
#define DECLARE_INTEGER_PARSER(name) \
private: \
static bool parse_##name(const char* str, size_t n, void* dest); \
static bool parse_##name##_radix(const char* str, size_t n, void* dest, \
int radix); \
\
public: \
static bool parse_##name##_hex(const char* str, size_t n, void* dest); \
static bool parse_##name##_octal(const char* str, size_t n, void* dest); \
static bool parse_##name##_cradix(const char* str, size_t n, void* dest)
DECLARE_INTEGER_PARSER(short);
DECLARE_INTEGER_PARSER(ushort);
DECLARE_INTEGER_PARSER(int);
DECLARE_INTEGER_PARSER(uint);
DECLARE_INTEGER_PARSER(long);
DECLARE_INTEGER_PARSER(ulong);
DECLARE_INTEGER_PARSER(longlong);
DECLARE_INTEGER_PARSER(ulonglong);
#undef DECLARE_INTEGER_PARSER
};
inline PCRE::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
inline PCRE::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
inline bool PCRE::Arg::Parse(const char* str, size_t n) const {
return (*parser_)(str, n, arg_);
}
// This part of the parser, appropriate only for ints, deals with bases
#define MAKE_INTEGER_PARSER(type, name) \
inline PCRE::Arg Hex(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_hex); \
} \
inline PCRE::Arg Octal(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_octal); \
} \
inline PCRE::Arg CRadix(type* ptr) { \
return PCRE::Arg(ptr, PCRE::Arg::parse_##name##_cradix); \
}
MAKE_INTEGER_PARSER(short, short);
MAKE_INTEGER_PARSER(unsigned short, ushort);
MAKE_INTEGER_PARSER(int, int);
MAKE_INTEGER_PARSER(unsigned int, uint);
MAKE_INTEGER_PARSER(long, long);
MAKE_INTEGER_PARSER(unsigned long, ulong);
MAKE_INTEGER_PARSER(long long, longlong);
MAKE_INTEGER_PARSER(unsigned long long, ulonglong);
#undef MAKE_INTEGER_PARSER
} // namespace re2
#endif // UTIL_PCRE_H_

View File

@ -1,260 +0,0 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "util/utf.h"
namespace re2 {
enum
{
Bit1 = 7,
Bitx = 6,
Bit2 = 5,
Bit3 = 4,
Bit4 = 3,
Bit5 = 2,
T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
Bad = Runeerror,
};
int
chartorune(Rune *rune, const char *str)
{
int c, c1, c2, c3;
long l;
/*
* one character sequence
* 00000-0007F => T1
*/
c = *(unsigned char*)str;
if(c < Tx) {
*rune = c;
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
c1 = *(unsigned char*)(str+1) ^ Tx;
if(c1 & Testx)
goto bad;
if(c < T3) {
if(c < T2)
goto bad;
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
return 2;
}
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
c2 = *(unsigned char*)(str+2) ^ Tx;
if(c2 & Testx)
goto bad;
if(c < T4) {
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
c3 = *(unsigned char*)(str+3) ^ Tx;
if (c3 & Testx)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
goto bad;
*rune = l;
return 4;
}
/*
* Support for 5-byte or longer UTF-8 would go here, but
* since we don't have that, we'll just fall through to bad.
*/
/*
* bad decoding
*/
bad:
*rune = Bad;
return 1;
}
int
runetochar(char *str, const Rune *rune)
{
/* Runes are signed, so convert to unsigned for range check. */
unsigned long c;
/*
* one character sequence
* 00000-0007F => 00-7F
*/
c = *rune;
if(c <= Rune1) {
str[0] = static_cast<char>(c);
return 1;
}
/*
* two character sequence
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
str[0] = T2 | static_cast<char>(c >> 1*Bitx);
str[1] = Tx | (c & Maskx);
return 2;
}
/*
* If the Rune is out of range, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
str[0] = T3 | static_cast<char>(c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
return 3;
}
/*
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | static_cast<char>(c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
return 4;
}
int
runelen(Rune rune)
{
char str[10];
return runetochar(str, &rune);
}
int
fullrune(const char *str, int n)
{
if (n > 0) {
int c = *(unsigned char*)str;
if (c < Tx)
return 1;
if (n > 1) {
if (c < T3)
return 1;
if (n > 2) {
if (c < T4 || n > 3)
return 1;
}
}
}
return 0;
}
int
utflen(const char *s)
{
int c;
long n;
Rune rune;
n = 0;
for(;;) {
c = *(unsigned char*)s;
if(c < Runeself) {
if(c == 0)
return n;
s++;
} else
s += chartorune(&rune, s);
n++;
}
return 0;
}
char*
utfrune(const char *s, Rune c)
{
long c1;
Rune r;
int n;
if(c < Runesync) /* not part of utf sequence */
return strchr((char*)s, c);
for(;;) {
c1 = *(unsigned char*)s;
if(c1 < Runeself) { /* one byte rune */
if(c1 == 0)
return 0;
if(c1 == c)
return (char*)s;
s++;
continue;
}
n = chartorune(&r, s);
if(r == c)
return (char*)s;
s += n;
}
return 0;
}
} // namespace re2

View File

@ -1,526 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_SPARSE_ARRAY_H_
#define UTIL_SPARSE_ARRAY_H_
// DESCRIPTION
//
// SparseArray<T>(m) is a map from integers in [0, m) to T values.
// It requires (sizeof(T)+sizeof(int))*m memory, but it provides
// fast iteration through the elements in the array and fast clearing
// of the array. The array has a concept of certain elements being
// uninitialized (having no value).
//
// Insertion and deletion are constant time operations.
//
// Allocating the array is a constant time operation
// when memory allocation is a constant time operation.
//
// Clearing the array is a constant time operation (unusual!).
//
// Iterating through the array is an O(n) operation, where n
// is the number of items in the array (not O(m)).
//
// The array iterator visits entries in the order they were first
// inserted into the array. It is safe to add items to the array while
// using an iterator: the iterator will visit indices added to the array
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseArray can be a convenient
// implementation of a work queue.
//
// The SparseArray implementation is NOT thread-safe. It is up to the
// caller to make sure only one thread is accessing the array. (Typically
// these arrays are temporary values and used in situations where speed is
// important.)
//
// The SparseArray interface does not present all the usual STL bells and
// whistles.
//
// Implemented with reference to Briggs & Torczon, An Efficient
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// Briggs & Torczon popularized this technique, but it had been known
// long before their paper. They point out that Aho, Hopcroft, and
// Ullman's 1974 Design and Analysis of Computer Algorithms and Bentley's
// 1986 Programming Pearls both hint at the technique in exercises to the
// reader (in Aho & Hopcroft, exercise 2.12; in Bentley, column 1
// exercise 8).
//
// Briggs & Torczon describe a sparse set implementation. I have
// trivially generalized it to create a sparse array (actually the original
// target of the AHU and Bentley exercises).
// IMPLEMENTATION
//
// SparseArray is an array dense_ and an array sparse_, both of size max_size_.
// At any point, the number of elements in the sparse array is size_.
//
// The array dense_ contains the size_ elements in the sparse array (with
// their indices),
// in the order that the elements were first inserted. This array is dense:
// the size_ pairs are dense_[0] through dense_[size_-1].
//
// The array sparse_ maps from indices in [0,m) to indices in [0,size_).
// For indices present in the array, dense_[sparse_[i]].index_ == i.
// For indices not present in the array, sparse_ can contain any value at all,
// perhaps outside the range [0, size_) but perhaps not.
//
// The lax requirement on sparse_ values makes clearing the array very easy:
// set size_ to 0. Lookups are slightly more complicated.
// An index i has a value in the array if and only if:
// sparse_[i] is in [0, size_) AND
// dense_[sparse_[i]].index_ == i.
// If both these properties hold, only then it is safe to refer to
// dense_[sparse_[i]].value_
// as the value associated with index i.
//
// To insert a new entry, set sparse_[i] to size_,
// initialize dense_[size_], and then increment size_.
//
// Deletion of specific values from the array is implemented by
// swapping dense_[size_-1] and the dense_ being deleted and then
// updating the appropriate sparse_ entries.
//
// To make the sparse array as efficient as possible for non-primitive types,
// elements may or may not be destroyed when they are deleted from the sparse
// array through a call to erase(), erase_existing() or resize(). They
// immediately become inaccessible, but they are only guaranteed to be
// destroyed when the SparseArray destructor is called.
//
// A moved-from SparseArray will be empty.
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include <assert.h>
#include <stdint.h>
#include <string.h>
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#include <algorithm>
#include <memory>
#include <type_traits>
#include <utility>
namespace re2 {
template<typename Value>
class SparseArray {
public:
SparseArray();
explicit SparseArray(int max_size);
~SparseArray();
// IndexValue pairs: exposed in SparseArray::iterator.
class IndexValue;
static_assert(std::is_trivially_destructible<IndexValue>::value,
"IndexValue must be trivially destructible");
typedef IndexValue value_type;
typedef IndexValue* iterator;
typedef const IndexValue* const_iterator;
SparseArray(const SparseArray& src);
SparseArray(SparseArray&& src) /*noexcept*/;
SparseArray& operator=(const SparseArray& src);
SparseArray& operator=(SparseArray&& src) /*noexcept*/;
const IndexValue& iv(int i) const;
// Return the number of entries in the array.
int size() const {
return size_;
}
// Indicate whether the array is empty.
int empty() const {
return size_ == 0;
}
// Iterate over the array.
iterator begin() {
return dense_.get();
}
iterator end() {
return dense_.get() + size_;
}
const_iterator begin() const {
return dense_.get();
}
const_iterator end() const {
return dense_.get() + size_;
}
// Change the maximum size of the array.
// Invalidates all iterators.
void resize(int max_size);
// Return the maximum size of the array.
// Indices can be in the range [0, max_size).
int max_size() const {
return max_size_;
}
// Clear the array.
void clear() {
size_ = 0;
}
// Check whether index i is in the array.
bool has_index(int i) const;
// Comparison function for sorting.
// Can sort the sparse array so that future iterations
// will visit indices in increasing order using
// std::sort(arr.begin(), arr.end(), arr.less);
static bool less(const IndexValue& a, const IndexValue& b);
public:
// Set the value at index i to v.
iterator set(int i, const Value& v) {
return SetInternal(true, i, v);
}
iterator set(int i, Value&& v) { // NOLINT
return SetInternal(true, i, std::move(v));
}
std::pair<iterator, bool> insert(const value_type& v) {
return InsertInternal(v);
}
std::pair<iterator, bool> insert(value_type&& v) { // NOLINT
return InsertInternal(std::move(v));
}
template <typename... Args>
std::pair<iterator, bool> emplace(Args&&... args) { // NOLINT
return InsertInternal(value_type(std::forward<Args>(args)...));
}
iterator find(int i) {
if (has_index(i))
return dense_.get() + sparse_[i];
return end();
}
const_iterator find(int i) const {
if (has_index(i))
return dense_.get() + sparse_[i];
return end();
}
// Change the value at index i to v.
// Fast but unsafe: only use if has_index(i) is true.
iterator set_existing(int i, const Value& v) {
return SetExistingInternal(i, v);
}
iterator set_existing(int i, Value&& v) { // NOLINT
return SetExistingInternal(i, std::move(v));
}
// Set the value at the new index i to v.
// Fast but unsafe: only use if has_index(i) is false.
iterator set_new(int i, const Value& v) {
return SetInternal(false, i, v);
}
iterator set_new(int i, Value&& v) { // NOLINT
return SetInternal(false, i, std::move(v));
}
// Get the value at index i from the array..
// Fast but unsafe: only use if has_index(i) is true.
const Value& get_existing(int i) const;
// Erasing items from the array during iteration is in general
// NOT safe. There is one special case, which is that the current
// index-value pair can be erased as long as the iterator is then
// checked for being at the end before being incremented.
// For example:
//
// for (i = m.begin(); i != m.end(); ++i) {
// if (ShouldErase(i->index(), i->value())) {
// m.erase(i->index());
// --i;
// }
// }
//
// Except in the specific case just described, elements must
// not be erased from the array (including clearing the array)
// while iterators are walking over the array. Otherwise,
// the iterators could walk past the end of the array.
// Erases the element at index i from the array.
void erase(int i);
// Erases the element at index i from the array.
// Fast but unsafe: only use if has_index(i) is true.
void erase_existing(int i);
private:
template <typename U>
std::pair<iterator, bool> InsertInternal(U&& v) {
DebugCheckInvariants();
std::pair<iterator, bool> p;
if (has_index(v.index_)) {
p = {dense_.get() + sparse_[v.index_], false};
} else {
p = {set_new(std::forward<U>(v).index_, std::forward<U>(v).second), true};
}
DebugCheckInvariants();
return p;
}
template <typename U>
iterator SetInternal(bool allow_overwrite, int i, U&& v) { // NOLINT
DebugCheckInvariants();
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
assert(false && "illegal index");
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!allow_overwrite) {
assert(!has_index(i));
create_index(i);
} else {
if (!has_index(i))
create_index(i);
}
return set_existing(i, std::forward<U>(v)); // NOLINT
}
template <typename U>
iterator SetExistingInternal(int i, U&& v) { // NOLINT
DebugCheckInvariants();
assert(has_index(i));
dense_[sparse_[i]].value() = std::forward<U>(v);
DebugCheckInvariants();
return dense_.get() + sparse_[i];
}
// Add the index i to the array.
// Only use if has_index(i) is known to be false.
// Since it doesn't set the value associated with i,
// this function is private, only intended as a helper
// for other methods.
void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
// Initializes memory for elements [min, max).
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
for (int i = min; i < max; i++) {
sparse_[i] = 0xababababU;
}
#endif
}
int size_ = 0;
int max_size_ = 0;
std::unique_ptr<int[]> sparse_;
std::unique_ptr<IndexValue[]> dense_;
};
template<typename Value>
SparseArray<Value>::SparseArray() = default;
template<typename Value>
SparseArray<Value>::SparseArray(const SparseArray& src)
: size_(src.size_),
max_size_(src.max_size_),
sparse_(new int[max_size_]),
dense_(new IndexValue[max_size_]) {
std::copy_n(src.sparse_.get(), max_size_, sparse_.get());
std::copy_n(src.dense_.get(), max_size_, dense_.get());
}
template<typename Value>
SparseArray<Value>::SparseArray(SparseArray&& src) /*noexcept*/ // NOLINT
: size_(src.size_),
max_size_(src.max_size_),
sparse_(std::move(src.sparse_)),
dense_(std::move(src.dense_)) {
src.size_ = 0;
src.max_size_ = 0;
}
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(const SparseArray& src) {
size_ = src.size_;
max_size_ = src.max_size_;
std::unique_ptr<int[]> a(new int[max_size_]);
std::copy_n(src.sparse_.get(), src.max_size_, a.get());
sparse_ = std::move(a);
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size_]);
std::copy_n(src.dense_.get(), src.max_size_, b.get());
dense_ = std::move(b);
return *this;
}
template<typename Value>
SparseArray<Value>& SparseArray<Value>::operator=(
SparseArray&& src) /*noexcept*/ { // NOLINT
size_ = src.size_;
max_size_ = src.max_size_;
sparse_ = std::move(src.sparse_);
dense_ = std::move(src.dense_);
// clear out the source
src.size_ = 0;
src.max_size_ = 0;
return *this;
}
// IndexValue pairs: exposed in SparseArray::iterator.
template<typename Value>
class SparseArray<Value>::IndexValue {
friend class SparseArray;
public:
typedef int first_type;
typedef Value second_type;
IndexValue() {}
IndexValue(int i, const Value& v) : index_(i), second(v) {}
IndexValue(int i, Value&& v) : index_(i), second(std::move(v)) {}
int index() const { return index_; }
Value& value() /*&*/ { return second; }
const Value& value() const /*&*/ { return second; }
//Value&& value() /*&&*/ { return std::move(second); } // NOLINT
private:
int index_;
public:
// Provide the data in the 'second' member so that the utilities
// in map-util work.
// TODO(billydonahue): 'second' is public for short-term compatibility.
// Users will be transitioned to using value() accessor.
Value second;
};
template<typename Value>
const typename SparseArray<Value>::IndexValue&
SparseArray<Value>::iv(int i) const {
assert(i >= 0);
assert(i < size_);
return dense_[i];
}
// Change the maximum size of the array.
// Invalidates all iterators.
template<typename Value>
void SparseArray<Value>::resize(int max_size) {
DebugCheckInvariants();
if (max_size > max_size_) {
std::unique_ptr<int[]> a(new int[max_size]);
if (sparse_) {
std::copy_n(sparse_.get(), max_size_, a.get());
}
sparse_ = std::move(a);
std::unique_ptr<IndexValue[]> b(new IndexValue[max_size]);
if (dense_) {
std::copy_n(dense_.get(), max_size_, b.get());
}
dense_ = std::move(b);
MaybeInitializeMemory(max_size_, max_size);
}
max_size_ = max_size;
if (size_ > max_size_)
size_ = max_size_;
DebugCheckInvariants();
}
// Check whether index i is in the array.
template<typename Value>
bool SparseArray<Value>::has_index(int i) const {
assert(i >= 0);
assert(i < max_size_);
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
return false;
}
// Unsigned comparison avoids checking sparse_[i] < 0.
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
dense_[sparse_[i]].index_ == i;
}
template<typename Value>
const Value& SparseArray<Value>::get_existing(int i) const {
assert(has_index(i));
return dense_[sparse_[i]].second;
}
template<typename Value>
void SparseArray<Value>::erase(int i) {
DebugCheckInvariants();
if (has_index(i))
erase_existing(i);
DebugCheckInvariants();
}
template<typename Value>
void SparseArray<Value>::erase_existing(int i) {
DebugCheckInvariants();
assert(has_index(i));
int di = sparse_[i];
if (di < size_ - 1) {
dense_[di] = std::move(dense_[size_ - 1]);
sparse_[dense_[di].index_] = di;
}
size_--;
DebugCheckInvariants();
}
template<typename Value>
void SparseArray<Value>::create_index(int i) {
assert(!has_index(i));
assert(size_ < max_size_);
sparse_[i] = size_;
dense_[size_].index_ = i;
size_++;
}
template<typename Value> SparseArray<Value>::SparseArray(int max_size) {
sparse_.reset(new int[max_size]);
dense_.reset(new IndexValue[max_size]);
size_ = 0;
MaybeInitializeMemory(size_, max_size);
max_size_ = max_size;
DebugCheckInvariants();
}
template<typename Value> SparseArray<Value>::~SparseArray() {
DebugCheckInvariants();
}
template<typename Value> void SparseArray<Value>::DebugCheckInvariants() const {
assert(0 <= size_);
assert(size_ <= max_size_);
assert(size_ == 0 || sparse_ != NULL);
}
// Comparison function for sorting.
template<typename Value> bool SparseArray<Value>::less(const IndexValue& a,
const IndexValue& b) {
return a.index_ < b.index_;
}
} // namespace re2
#endif // UTIL_SPARSE_ARRAY_H_

View File

@ -1,266 +0,0 @@
// Copyright 2006 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_SPARSE_SET_H_
#define UTIL_SPARSE_SET_H_
// DESCRIPTION
//
// SparseSet(m) is a set of integers in [0, m).
// It requires sizeof(int)*m memory, but it provides
// fast iteration through the elements in the set and fast clearing
// of the set.
//
// Insertion and deletion are constant time operations.
//
// Allocating the set is a constant time operation
// when memory allocation is a constant time operation.
//
// Clearing the set is a constant time operation (unusual!).
//
// Iterating through the set is an O(n) operation, where n
// is the number of items in the set (not O(m)).
//
// The set iterator visits entries in the order they were first
// inserted into the set. It is safe to add items to the set while
// using an iterator: the iterator will visit indices added to the set
// during the iteration, but will not re-visit indices whose values
// change after visiting. Thus SparseSet can be a convenient
// implementation of a work queue.
//
// The SparseSet implementation is NOT thread-safe. It is up to the
// caller to make sure only one thread is accessing the set. (Typically
// these sets are temporary values and used in situations where speed is
// important.)
//
// The SparseSet interface does not present all the usual STL bells and
// whistles.
//
// Implemented with reference to Briggs & Torczon, An Efficient
// Representation for Sparse Sets, ACM Letters on Programming Languages
// and Systems, Volume 2, Issue 1-4 (March-Dec. 1993), pp. 59-69.
//
// This is a specialization of sparse array; see sparse_array.h.
// IMPLEMENTATION
//
// See sparse_array.h for implementation details.
// Doing this simplifies the logic below.
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include <assert.h>
#include <stdint.h>
#include <string.h>
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#include <algorithm>
#include <memory>
#include <utility>
namespace re2 {
template<typename Value>
class SparseSetT {
public:
SparseSetT();
explicit SparseSetT(int max_size);
~SparseSetT();
typedef int* iterator;
typedef const int* const_iterator;
// Return the number of entries in the set.
int size() const {
return size_;
}
// Indicate whether the set is empty.
int empty() const {
return size_ == 0;
}
// Iterate over the set.
iterator begin() {
return dense_.get();
}
iterator end() {
return dense_.get() + size_;
}
const_iterator begin() const {
return dense_.get();
}
const_iterator end() const {
return dense_.get() + size_;
}
// Change the maximum size of the set.
// Invalidates all iterators.
void resize(int max_size);
// Return the maximum size of the set.
// Indices can be in the range [0, max_size).
int max_size() const {
return max_size_;
}
// Clear the set.
void clear() {
size_ = 0;
}
// Check whether index i is in the set.
bool contains(int i) const;
// Comparison function for sorting.
// Can sort the sparse set so that future iterations
// will visit indices in increasing order using
// std::sort(arr.begin(), arr.end(), arr.less);
static bool less(int a, int b);
public:
// Insert index i into the set.
iterator insert(int i) {
return InsertInternal(true, i);
}
// Insert index i into the set.
// Fast but unsafe: only use if contains(i) is false.
iterator insert_new(int i) {
return InsertInternal(false, i);
}
private:
iterator InsertInternal(bool allow_existing, int i) {
DebugCheckInvariants();
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
assert(false && "illegal index");
// Semantically, end() would be better here, but we already know
// the user did something stupid, so begin() insulates them from
// dereferencing an invalid pointer.
return begin();
}
if (!allow_existing) {
assert(!contains(i));
create_index(i);
} else {
if (!contains(i))
create_index(i);
}
DebugCheckInvariants();
return dense_.get() + sparse_[i];
}
// Add the index i to the set.
// Only use if contains(i) is known to be false.
// This function is private, only intended as a helper
// for other methods.
void create_index(int i);
// In debug mode, verify that some invariant properties of the class
// are being maintained. This is called at the end of the constructor
// and at the beginning and end of all public non-const member functions.
void DebugCheckInvariants() const;
// Initializes memory for elements [min, max).
void MaybeInitializeMemory(int min, int max) {
#if __has_feature(memory_sanitizer)
__msan_unpoison(sparse_.get() + min, (max - min) * sizeof sparse_[0]);
#elif defined(RE2_ON_VALGRIND)
for (int i = min; i < max; i++) {
sparse_[i] = 0xababababU;
}
#endif
}
int size_ = 0;
int max_size_ = 0;
std::unique_ptr<int[]> sparse_;
std::unique_ptr<int[]> dense_;
};
template<typename Value>
SparseSetT<Value>::SparseSetT() = default;
// Change the maximum size of the set.
// Invalidates all iterators.
template<typename Value>
void SparseSetT<Value>::resize(int max_size) {
DebugCheckInvariants();
if (max_size > max_size_) {
std::unique_ptr<int[]> a(new int[max_size]);
if (sparse_) {
std::copy_n(sparse_.get(), max_size_, a.get());
}
sparse_ = std::move(a);
std::unique_ptr<int[]> b(new int[max_size]);
if (dense_) {
std::copy_n(dense_.get(), max_size_, b.get());
}
dense_ = std::move(b);
MaybeInitializeMemory(max_size_, max_size);
}
max_size_ = max_size;
if (size_ > max_size_)
size_ = max_size_;
DebugCheckInvariants();
}
// Check whether index i is in the set.
template<typename Value>
bool SparseSetT<Value>::contains(int i) const {
assert(i >= 0);
assert(i < max_size_);
if (static_cast<uint32_t>(i) >= static_cast<uint32_t>(max_size_)) {
return false;
}
// Unsigned comparison avoids checking sparse_[i] < 0.
return (uint32_t)sparse_[i] < (uint32_t)size_ &&
dense_[sparse_[i]] == i;
}
template<typename Value>
void SparseSetT<Value>::create_index(int i) {
assert(!contains(i));
assert(size_ < max_size_);
sparse_[i] = size_;
dense_[size_] = i;
size_++;
}
template<typename Value> SparseSetT<Value>::SparseSetT(int max_size) {
sparse_.reset(new int[max_size]);
dense_.reset(new int[max_size]);
size_ = 0;
MaybeInitializeMemory(size_, max_size);
max_size_ = max_size;
DebugCheckInvariants();
}
template<typename Value> SparseSetT<Value>::~SparseSetT() {
DebugCheckInvariants();
}
template<typename Value> void SparseSetT<Value>::DebugCheckInvariants() const {
assert(0 <= size_);
assert(size_ <= max_size_);
assert(size_ == 0 || sparse_ != NULL);
}
// Comparison function for sorting.
template<typename Value> bool SparseSetT<Value>::less(int a, int b) {
return a < b;
}
typedef SparseSetT<void> SparseSet;
} // namespace re2
#endif // UTIL_SPARSE_SET_H_

View File

@ -1,164 +0,0 @@
// Copyright 1999-2005 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdarg.h>
#include <stdio.h>
#include "util/strutil.h"
#ifdef _WIN32
#define snprintf _snprintf
#define vsnprintf _vsnprintf
#endif
namespace re2 {
// ----------------------------------------------------------------------
// CEscapeString()
// Copies 'src' to 'dest', escaping dangerous characters using
// C-style escape sequences. 'src' and 'dest' should not overlap.
// Returns the number of bytes written to 'dest' (not including the \0)
// or (size_t)-1 if there was insufficient space.
// ----------------------------------------------------------------------
static size_t CEscapeString(const char* src, size_t src_len,
char* dest, size_t dest_len) {
const char* src_end = src + src_len;
size_t used = 0;
for (; src < src_end; src++) {
if (dest_len - used < 2) // space for two-character escape
return (size_t)-1;
unsigned char c = *src;
switch (c) {
case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
default:
// Note that if we emit \xNN and the src character after that is a hex
// digit then that digit must be escaped too to prevent it being
// interpreted as part of the character code by C.
if (c < ' ' || c > '~') {
if (dest_len - used < 5) // space for four-character escape + \0
return (size_t)-1;
snprintf(dest + used, 5, "\\%03o", c);
used += 4;
} else {
dest[used++] = c; break;
}
}
}
if (dest_len - used < 1) // make sure that there is room for \0
return (size_t)-1;
dest[used] = '\0'; // doesn't count towards return value though
return used;
}
// ----------------------------------------------------------------------
// CEscape()
// Copies 'src' to result, escaping dangerous characters using
// C-style escape sequences. 'src' and 'dest' should not overlap.
// ----------------------------------------------------------------------
string CEscape(const StringPiece& src) {
const size_t dest_len = src.size() * 4 + 1; // Maximum possible expansion
char* dest = new char[dest_len];
const size_t used = CEscapeString(src.data(), src.size(),
dest, dest_len);
string s = string(dest, used);
delete[] dest;
return s;
}
void PrefixSuccessor(string* prefix) {
// We can increment the last character in the string and be done
// unless that character is 255, in which case we have to erase the
// last character and increment the previous character, unless that
// is 255, etc. If the string is empty or consists entirely of
// 255's, we just return the empty string.
while (!prefix->empty()) {
char& c = prefix->back();
if (c == '\xff') { // char literal avoids signed/unsigned.
prefix->pop_back();
} else {
++c;
break;
}
}
}
static void StringAppendV(string* dst, const char* format, va_list ap) {
// First try with a small fixed size buffer
char space[1024];
// It's possible for methods that use a va_list to invalidate
// the data in it upon use. The fix is to make a copy
// of the structure before using it and use that copy instead.
va_list backup_ap;
va_copy(backup_ap, ap);
int result = vsnprintf(space, sizeof(space), format, backup_ap);
va_end(backup_ap);
if ((result >= 0) && (static_cast<size_t>(result) < sizeof(space))) {
// It fit
dst->append(space, result);
return;
}
// Repeatedly increase buffer size until it fits
int length = sizeof(space);
while (true) {
if (result < 0) {
// Older behavior: just try doubling the buffer size
length *= 2;
} else {
// We need exactly "result+1" characters
length = result+1;
}
char* buf = new char[length];
// Restore the va_list before we use it again
va_copy(backup_ap, ap);
result = vsnprintf(buf, length, format, backup_ap);
va_end(backup_ap);
if ((result >= 0) && (result < length)) {
// It fit
dst->append(buf, result);
delete[] buf;
return;
}
delete[] buf;
}
}
string StringPrintf(const char* format, ...) {
va_list ap;
va_start(ap, format);
string result;
StringAppendV(&result, format, ap);
va_end(ap);
return result;
}
void SStringPrintf(string* dst, const char* format, ...) {
va_list ap;
va_start(ap, format);
dst->clear();
StringAppendV(dst, format, ap);
va_end(ap);
}
void StringAppendF(string* dst, const char* format, ...) {
va_list ap;
va_start(ap, format);
StringAppendV(dst, format, ap);
va_end(ap);
}
} // namespace re2

View File

@ -1,23 +0,0 @@
// Copyright 2016 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_STRUTIL_H_
#define UTIL_STRUTIL_H_
#include <string>
#include "re2/stringpiece.h"
#include "util/util.h"
namespace re2 {
string CEscape(const StringPiece& src);
void PrefixSuccessor(string* prefix);
string StringPrintf(const char* format, ...);
void SStringPrintf(string* dst, const char* format, ...);
void StringAppendF(string* dst, const char* format, ...);
} // namespace re2
#endif // UTIL_STRUTIL_H_

View File

@ -1,34 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include <stdio.h>
#ifndef _WIN32
#include <sys/resource.h>
#endif
#include "util/test.h"
DEFINE_string(test_tmpdir, "/var/tmp", "temp directory");
struct Test {
void (*fn)(void);
const char *name;
};
static Test tests[10000];
static int ntests;
void RegisterTest(void (*fn)(void), const char *name) {
tests[ntests].fn = fn;
tests[ntests++].name = name;
}
int main(int argc, char** argv) {
for (int i = 0; i < ntests; i++) {
printf("%s\n", tests[i].name);
tests[i].fn();
}
printf("PASS\n");
return 0;
}

View File

@ -1,46 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_TEST_H_
#define UTIL_TEST_H_
#include "util/util.h"
#include "util/flags.h"
#include "util/logging.h"
#define TEST(x, y) \
void x##y(void); \
TestRegisterer r##x##y(x##y, # x "." # y); \
void x##y(void)
void RegisterTest(void (*)(void), const char*);
class TestRegisterer {
public:
TestRegisterer(void (*fn)(void), const char *s) {
RegisterTest(fn, s);
}
};
// TODO(rsc): Do a better job.
#define EXPECT_EQ CHECK_EQ
#define EXPECT_TRUE CHECK
#define EXPECT_LT CHECK_LT
#define EXPECT_GT CHECK_GT
#define EXPECT_LE CHECK_LE
#define EXPECT_GE CHECK_GE
#define EXPECT_FALSE(x) CHECK(!(x))
namespace testing {
class MallocCounter {
public:
MallocCounter(int x) {}
static const int THIS_THREAD_ONLY = 0;
long long HeapGrowth() { return 0; }
long long PeakHeapGrowth() { return 0; }
void Reset() {}
};
} // namespace testing
#endif // UTIL_TEST_H_

View File

@ -1,44 +0,0 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*
* This file and rune.cc have been converted to compile as C++ code
* in name space re2.
*/
#ifndef UTIL_UTF_H_
#define UTIL_UTF_H_
#include <stdint.h>
namespace re2 {
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
enum
{
UTFmax = 4, /* maximum bytes per rune */
Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
Runeself = 0x80, /* rune and UTF sequences are the same (<) */
Runeerror = 0xFFFD, /* decoding error in UTF */
Runemax = 0x10FFFF, /* maximum rune value */
};
int runetochar(char* s, const Rune* r);
int chartorune(Rune* r, const char* s);
int fullrune(const char* s, int n);
int utflen(const char* s);
char* utfrune(const char*, Rune);
} // namespace re2
#endif // UTIL_UTF_H_

View File

@ -1,22 +0,0 @@
// Copyright 2009 The RE2 Authors. All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#ifndef UTIL_UTIL_H_
#define UTIL_UTIL_H_
// TODO(junyer): Get rid of this.
#include <string>
using std::string;
#define arraysize(array) (int)(sizeof(array)/sizeof((array)[0]))
#ifndef FALLTHROUGH_INTENDED
#define FALLTHROUGH_INTENDED do { } while (0)
#endif
#ifndef NO_THREAD_SAFETY_ANALYSIS
#define NO_THREAD_SAFETY_ANALYSIS
#endif
#endif // UTIL_UTIL_H_

1
contrib/re2 vendored Submodule

@ -0,0 +1 @@
Subproject commit 7cf8b88e8f70f97fd4926b56aa87e7f53b2717e0

View File

@ -1,50 +1,23 @@
set (re2_sources
./re2/bitstate.cc
./re2/compile.cc
./re2/dfa.cc
./re2/filtered_re2.cc
./re2/mimics_pcre.cc
./re2/nfa.cc
./re2/onepass.cc
./re2/parse.cc
./re2/perl_groups.cc
./re2/prefilter.cc
./re2/prefilter_tree.cc
./re2/prog.cc
./re2/re2.cc
./re2/regexp.cc
./re2/set.cc
./re2/simplify.cc
./re2/stringpiece.cc
./re2/tostring.cc
./re2/unicode_casefold.cc
./re2/unicode_groups.cc
./util/benchmark.cc
./util/fuzz.cc
./util/pcre.cc
./util/rune.cc
./util/strutil.cc
./util/test.cc
)
# Building re2 which is thread-safe and re2_st which is not.
# re2 changes its state during matching of regular expression, e.g. creates temporary DFA.
# It uses RWLock to process the same regular expression object from different threads.
# In order to avoid redundant locks in some cases, we use not thread-safe version of the library (re2_st).
add_library (re2 ${re2_sources})
add_library (re2_st ${re2_sources})
set (RE2_SOURCE_DIR ${ClickHouse_SOURCE_DIR}/contrib/re2/)
target_compile_definitions (re2 PRIVATE NDEBUG)
get_target_property (RE2_SOURCES_ re2 SOURCES)
foreach (src ${RE2_SOURCES_})
list(APPEND RE2_ST_SOURCES ${RE2_SOURCE_DIR}/${src})
endforeach ()
add_library (re2_st ${RE2_ST_SOURCES})
target_compile_definitions (re2_st PRIVATE NDEBUG NO_THREADS re2=re2_st)
target_include_directories (re2 PUBLIC .)
target_include_directories (re2_st PRIVATE . PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
target_include_directories (re2_st PRIVATE . PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ${RE2_SOURCE_DIR})
file (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/re2_st)
foreach (FILENAME filtered_re2.h re2.h set.h stringpiece.h variadic_function.h)
foreach (FILENAME filtered_re2.h re2.h set.h stringpiece.h)
add_custom_command (OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/re2_st/${FILENAME}"
COMMAND ${CMAKE_COMMAND} -DSOURCE_FILENAME="${CMAKE_CURRENT_SOURCE_DIR}/re2/${FILENAME}"
COMMAND ${CMAKE_COMMAND} -DSOURCE_FILENAME="${RE2_SOURCE_DIR}/re2/${FILENAME}"
-DTARGET_FILENAME="${CMAKE_CURRENT_BINARY_DIR}/re2_st/${FILENAME}"
-P "${CMAKE_CURRENT_SOURCE_DIR}/re2_transform.cmake"
COMMENT "Creating ${FILENAME} for re2_st library.")

View File

@ -80,16 +80,6 @@ list (APPEND dbms_headers
list (APPEND dbms_sources src/TableFunctions/TableFunctionFactory.cpp)
list (APPEND dbms_headers src/TableFunctions/ITableFunction.h src/TableFunctions/TableFunctionFactory.h)
if (APPLE OR CMAKE_SYSTEM MATCHES "FreeBSD")
list(REMOVE_ITEM clickhouse_common_io_headers
src/Common/AIO.h
src/IO/WriteBufferAIO.h
src/IO/ReadBufferAIO.h)
list(REMOVE_ITEM clickhouse_common_io_sources
src/IO/ReadBufferAIO.cpp
src/IO/WriteBufferAIO.cpp)
endif()
add_library(clickhouse_common_io ${SPLIT_SHARED} ${clickhouse_common_io_headers} ${clickhouse_common_io_sources})
add_subdirectory(src/Common/ZooKeeper)
@ -142,6 +132,7 @@ target_link_libraries (clickhouse_common_io
${ZLIB_LIBRARIES}
${EXECINFO_LIBRARY}
${Boost_SYSTEM_LIBRARY}
${CMAKE_DL_LIBS}
)
target_link_libraries (dbms
@ -203,7 +194,6 @@ if (USE_RDKAFKA)
endif ()
target_link_libraries (dbms
${CMAKE_DL_LIBS}
Threads::Threads
)
@ -228,7 +218,7 @@ target_include_directories (clickhouse_common_io BEFORE PRIVATE ${COMMON_INCLUDE
add_subdirectory (tests)
if (ENABLE_TESTS)
include (${PROJECT_SOURCE_DIR}/cmake/find_gtest.cmake)
include (${ClickHouse_SOURCE_DIR}/cmake/find_gtest.cmake)
if (USE_INTERNAL_GTEST_LIBRARY)
# Google Test from sources

View File

@ -1,6 +1,6 @@
# This strings autochanged from release_lib.sh:
set(VERSION_DESCRIBE v1.1.54340-testing)
set(VERSION_REVISION 54340)
set(VERSION_DESCRIBE v1.1.54343-testing)
set(VERSION_REVISION 54343)
# end of autochange
set (VERSION_MAJOR 1)

View File

@ -4,6 +4,7 @@
#include <Poco/Net/DNS.h>
#include <Common/getFQDNOrHostName.h>
#include <Common/isLocalAddress.h>
#include <Common/ProfileEvents.h>
#include <Interpreters/Settings.h>
@ -39,14 +40,7 @@ ConnectionPoolWithFailover::ConnectionPoolWithFailover(
for (size_t i = 0; i < nested_pools.size(); ++i)
{
ConnectionPool & connection_pool = dynamic_cast<ConnectionPool &>(*nested_pools[i]);
const std::string & host = connection_pool.getHost();
size_t hostname_difference = 0;
for (size_t i = 0; i < std::min(local_hostname.length(), host.length()); ++i)
if (local_hostname[i] != host[i])
++hostname_difference;
hostname_differences[i] = hostname_difference;
hostname_differences[i] = getHostNameDifference(local_hostname, connection_pool.getHost());
}
}

View File

@ -1,5 +1,7 @@
#pragma once
#if !(defined(__FreeBSD__) || defined(__APPLE__))
#include <Common/Exception.h>
#include <common/logger_useful.h>
#include <ext/singleton.h>
@ -220,3 +222,5 @@ public:
}
#endif

View File

@ -364,6 +364,8 @@ namespace ErrorCodes
extern const int MULTIPLE_STREAMS_REQUIRED = 385;
extern const int NO_COMMON_TYPE = 386;
extern const int EXTERNAL_LOADABLE_ALREADY_EXISTS = 387;
extern const int CANNOT_ASSIGN_OPTIMIZE = 388;
extern const int INSERT_WAS_DEDUPLICATED = 389;
extern const int KEEPER_EXCEPTION = 999;
extern const int POCO_EXCEPTION = 1000;

View File

@ -51,6 +51,7 @@
M(DelayedInserts) \
M(RejectedInserts) \
M(DelayedInsertsMilliseconds) \
M(DuplicatedInsertedBlocks) \
\
M(ZooKeeperInit) \
M(ZooKeeperTransactions) \

View File

@ -41,7 +41,7 @@ struct TypeList<THead, TTail...>
}
};
/// Append Type to TypeList
/// Prepend Type to TypeList
/// Usage:
/// using TypeListWithType = typename AppendToTypeList<Type, ConcreteTypeList>::Type;
template <typename TypeToPrepend, typename List, typename ... Types>
@ -56,6 +56,21 @@ struct PrependToTypeList<TypeToPrepend, TypeList<>, Types ...>
using Type = TypeList<TypeToPrepend, Types ...>;
};
/// Append Type to TypeList
/// Usage:
/// using TypeListWithType = typename AppendToTypeList<Type, ConcreteTypeList>::Type;
template <typename TypeToAppend, typename List, typename ... Types>
struct AppendToTypeList
{
using Type = typename AppendToTypeList<TypeToAppend, typename List::Tail, Types ..., typename List::Head>::Type;
};
template <typename TypeToAppend, typename ... Types>
struct AppendToTypeList<TypeToAppend, TypeList<>, Types ...>
{
using Type = TypeList<Types ..., TypeToAppend>;
};
/// Apply TypeList as variadic template argument of Class.
/// Usage:
/// using ClassWithAppliedTypeList = typename ApplyTypeListForClass<Class, ConcreteTypeList>::Type;
@ -71,4 +86,38 @@ struct ApplyTypeListForClass<Class, TypeList<>, Types ...>
using Type = Class<Types ...>;
};
/// TypeList concatenation.
/// Usage:
/// using ResultTypeList = typename TypeListConcat<LeftList, RightList>::Type;
template <typename TypeListLeft, typename TypeListRight>
struct TypeListConcat
{
using Type = typename TypeListConcat<
typename AppendToTypeList<typename TypeListRight::Head, TypeListLeft>::Type,
typename TypeListRight::Tail>::Type;
};
template <typename TypeListLeft>
struct TypeListConcat<TypeListLeft, TypeList<>>
{
using Type = TypeListLeft;
};
/// TypeList Map function.
/// Usage:
/// using ResultTypeList = typename TypeListMap<Function, TypeListArgs>::Type;
template <template <typename> typename Function, typename TypeListArgs>
struct TypeListMap
{
using Type = typename PrependToTypeList<
Function<typename TypeListArgs::Head>,
typename TypeListMap<Function, typename TypeListArgs::Tail>::Type>::Type;
};
template <template <typename> typename Function>
struct TypeListMap<Function, TypeList<>>
{
using Type = TypeList<>;
};
}

152
dbms/src/Common/Visitor.h Normal file
View File

@ -0,0 +1,152 @@
#pragma once
#include <Common/demangle.h>
#include <Common/TypeList.h>
#include <Common/Exception.h>
/* Generic utils which are intended for visitor pattern implementation.
* The original purpose is to provide possibility to get concrete template specialisation for type in list.
*
* Usage:
* 1. Declare visitor interface base class for types T_1, ..., T_N:
* class MyVisitor : public Visitor<T_1, ..., T_N> {};
* 2. Declare visitor implementation base class using VisitorImpl:
* template <typename Derived>
* class MyVisitorImpl : public VisitorImpl<Derived, MyVisitor> {};
* 3. Add virtual function 'accept' to common base of T_1, ..., T_N:
* class T_Base
* {
* ...
* public:
* virtual void accept(MyVisitor &) { throw Exception("Accept not implemented"); }
* };
* 4. Declare base class for T_1, ..., T_N implementation:
* template <typename Derived>
* class T_Impl : public Visitable<Derived, T_Base, MyVisitor> {};
* 5. Implement 'accept' for each T_i:
* a) inherit from T_Impl:
* class T_i : public T_Impl<T_i> { ... };
* b) or in order to avoid ambiguity:
* class T_i
* {
* ...
* public:
* void accept(MyVisitor & visitor) override { visitor.visit(*this); }
* };
* 6. Implement concrete visitor with visitImpl template function:
* class MyConcreteVisitor : public MyVisitorImpl<MyConcreteVisitor>
* {
* ...
* public:
* template <typename T>
* void visitImpl(T & t) { ... }
* };
* 7. Now you can call concrete implementation for MyConcreteVisitor:
* MyConcreteVisitor visitor;
* T_Base * base;
* base->accept(visitor); /// call MyConcreteVisitor::visitImpl(T & t)
*
* TODO: Add ConstVisitor with 'visit(const Type &)' function in order to implement 'accept(...) const'.
*/
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
template <typename ... Types>
class Visitor;
template <>
class Visitor<>
{
public:
using List = TypeList<>;
virtual ~Visitor() = default;
};
template <typename Type>
class Visitor<Type> : public Visitor<>
{
public:
using List = TypeList<Type>;
virtual void visit(Type &) = 0;
};
template <typename Type, typename ... Types>
class Visitor<Type, Types ...> : public Visitor<Types ...>
{
public:
using List = TypeList<Type, Types ...>;
using Visitor<Types ...>::visit;
virtual void visit(Type &) = 0;
};
template <typename Derived, typename VisitorBase, typename ... Types>
class VisitorImplHelper;
template <typename Derived, typename VisitorBase>
class VisitorImplHelper<Derived, VisitorBase> : public VisitorBase
{
};
template <typename Derived, typename VisitorBase, typename Type>
class VisitorImplHelper<Derived, VisitorBase, Type> : public VisitorBase
{
public:
using VisitorBase::visit;
void visit(Type & value) override { static_cast<Derived *>(this)->visitImpl(value); }
protected:
template <typename T>
void visitImpl(Type &)
{
throw Exception("visitImpl(" + demangle(typeid(T).name()) + " &)" + " is not implemented for class"
+ demangle(typeid(Derived).name()), ErrorCodes::LOGICAL_ERROR);
}
};
template <typename Derived, typename VisitorBase, typename Type, typename ... Types>
class VisitorImplHelper<Derived, VisitorBase, Type, Types ...>
: public VisitorImplHelper<Derived, VisitorBase, Types ...>
{
public:
using VisitorImplHelper<Derived, VisitorBase, Types ...>::visit;
void visit(Type & value) override { static_cast<Derived *>(this)->visitImpl(value); }
protected:
template <typename T>
void visitImpl(Type &)
{
throw Exception("visitImpl(" + demangle(typeid(T).name()) + " &)" + " is not implemented for class"
+ demangle(typeid(Derived).name()), ErrorCodes::LOGICAL_ERROR);
}
};
template <typename Derived, typename VisitorBase>
class VisitorImpl : public
ApplyTypeListForClass<
VisitorImplHelper,
typename TypeListConcat<
TypeList<Derived, VisitorBase>,
typename VisitorBase::List
>::Type
>::Type
{
};
template <typename Derived, typename Base, typename Visitor>
class Visitable : public Base
{
public:
void accept(Visitor & visitor) override { visitor.visit(*static_cast<Derived *>(this)); }
};
}

View File

@ -41,6 +41,16 @@ inline bool isHardwareErrorCode(int32_t zk_return_code)
return isUnrecoverableErrorCode(zk_return_code) || isTemporaryErrorCode(zk_return_code);
}
/// Valid errors sent from server
inline bool isUserError(int32_t zk_return_code)
{
return zk_return_code == ZNONODE
|| zk_return_code == ZBADVERSION
|| zk_return_code == ZNOCHILDRENFOREPHEMERALS
|| zk_return_code == ZNODEEXISTS
|| zk_return_code == ZNOTEMPTY;
}
class KeeperException : public DB::Exception
{

View File

@ -21,7 +21,9 @@ public:
virtual std::unique_ptr<Op> clone() const = 0;
virtual std::string describe() = 0;
virtual std::string getPath() const = 0;
virtual std::string describe() const = 0;
std::unique_ptr<zoo_op_t> data;
@ -44,7 +46,9 @@ struct Op::Remove : public Op
return std::unique_ptr<zkutil::Op>(new Remove(path, version));
}
std::string describe() override { return "command: remove, path: " + path; }
std::string getPath() const override { return path; }
std::string describe() const override { return "command: remove, path: " + path; }
private:
std::string path;
@ -53,27 +57,26 @@ private:
struct Op::Create : public Op
{
Create(const std::string & path_, const std::string & value_, ACLPtr acl_, int32_t flags_);
Create(const std::string & path_pattern_, const std::string & value_, ACLPtr acl_, int32_t flags_);
std::unique_ptr<Op> clone() const override
{
return std::unique_ptr<zkutil::Op>(new Create(path, value, acl, flags));
return std::unique_ptr<zkutil::Op>(new Create(path_pattern, value, acl, flags));
}
std::string getPathCreated()
{
return created_path.data();
}
std::string getPathCreated() { return created_path.data(); }
std::string describe() override
std::string getPath() const override { return path_pattern; }
std::string describe() const override
{
return "command: create"
", path: " + path +
", path: " + path_pattern +
", value: " + value;
}
private:
std::string path;
std::string path_pattern;
std::string value;
ACLPtr acl;
int32_t flags;
@ -93,7 +96,9 @@ struct Op::SetData : public Op
return std::unique_ptr<zkutil::Op>(new SetData(path, value, version));
}
std::string describe() override
std::string getPath() const override { return path; }
std::string describe() const override
{
return
"command: set"
@ -122,7 +127,9 @@ struct Op::Check : public Op
return std::unique_ptr<zkutil::Op>(new Check(path, version));
}
std::string describe() override { return "command: check, path: " + path; }
std::string getPath() const override { return path; }
std::string describe() const override { return "command: check, path: " + path; }
private:
std::string path;
@ -131,11 +138,12 @@ private:
struct OpResult : public zoo_op_result_t
{
/// Указатели в этой структуре указывают на поля в классе Op.
/// Поэтому деструктор не нужен
/// Pointers in this class point to fields of class Op.
/// Op instances have the same (or longer lifetime), therefore destructor is not required.
};
using Ops = std::vector<std::unique_ptr<Op>>;
using OpPtr = std::unique_ptr<Op>;
using Ops = std::vector<OpPtr>;
using OpResults = std::vector<OpResult>;
using OpResultsPtr = std::shared_ptr<OpResults>;
using Strings = std::vector<std::string>;
@ -162,4 +170,11 @@ class ZooKeeper;
/// path - znode path to which the change happened. if event == ZOO_SESSION_EVENT it is either NULL or empty string.
using WatchCallback = std::function<void(ZooKeeper & zookeeper, int type, int state, const char * path)>;
/// Returns first op which code != ZOK or throws an exception
/// ZooKeeper client sets correct OP codes if the transaction fails because of logical (user) errors like ZNODEEXISTS
/// If it is failed because of network error, for example, OP codes is not set.
/// Therefore you should make zkutil::isUserError() check before the function invocation.
size_t getFailedOpIndex(const OpResultsPtr & op_results, int32_t transaction_return_code);
}

View File

@ -29,6 +29,15 @@ namespace CurrentMetrics
}
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
}
namespace zkutil
{
@ -581,6 +590,7 @@ int32_t ZooKeeper::multiImpl(const Ops & ops_, OpResultsPtr * out_results_)
/// Copy the struct containing pointers with default copy-constructor.
/// It is safe because it hasn't got a destructor.
std::vector<zoo_op_t> ops;
ops.reserve(ops_.size());
for (const auto & op : ops_)
ops.push_back(*(op->data));
@ -605,7 +615,7 @@ OpResultsPtr ZooKeeper::multi(const Ops & ops)
for (size_t i = 0; i < ops.size(); ++i)
{
if (results->at(i).err == code)
throw KeeperException("multi() failed at op #" + std::to_string(i) + ", " + ops[i]->describe(), code);
throw KeeperException("Transaction failed at op #" + std::to_string(i) + ": " + ops[i]->describe(), code);
}
}
@ -619,15 +629,22 @@ int32_t ZooKeeper::tryMulti(const Ops & ops_, OpResultsPtr * out_results_)
int32_t code = multiImpl(ops_, out_results_);
if (!(code == ZOK ||
code == ZNONODE ||
code == ZNODEEXISTS ||
code == ZNOCHILDRENFOREPHEMERALS ||
code == ZBADVERSION ||
code == ZNOTEMPTY))
code == ZNONODE ||
code == ZNODEEXISTS ||
code == ZNOCHILDRENFOREPHEMERALS ||
code == ZBADVERSION ||
code == ZNOTEMPTY))
throw KeeperException(code);
return code;
}
int32_t ZooKeeper::tryMultiUnsafe(const Ops & ops, MultiTransactionInfo & info)
{
info.code = multiImpl(ops, &info.op_results);
info.ops = &ops;
return info.code;
}
int32_t ZooKeeper::tryMultiWithRetries(const Ops & ops, OpResultsPtr * out_results, size_t * attempt)
{
int32_t code = retry(std::bind(&ZooKeeper::multiImpl, this, std::ref(ops), out_results), attempt);
@ -739,10 +756,10 @@ ZooKeeperPtr ZooKeeper::startNewSession() const
return std::make_shared<ZooKeeper>(hosts, identity, session_timeout_ms);
}
Op::Create::Create(const std::string & path_, const std::string & value_, ACLPtr acl_, int32_t flags_)
: path(path_), value(value_), acl(acl_), flags(flags_), created_path(path.size() + ZooKeeper::SEQUENTIAL_SUFFIX_SIZE)
Op::Create::Create(const std::string & path_pattern_, const std::string & value_, ACLPtr acl_, int32_t flags_)
: path_pattern(path_pattern_), value(value_), acl(acl_), flags(flags_), created_path(path_pattern_.size() + ZooKeeper::SEQUENTIAL_SUFFIX_SIZE)
{
zoo_create_op_init(data.get(), path.c_str(), value.c_str(), value.size(), acl, flags, created_path.data(), created_path.size());
zoo_create_op_init(data.get(), path_pattern.c_str(), value.c_str(), value.size(), acl, flags, created_path.data(), created_path.size());
}
ACLPtr ZooKeeper::getDefaultACL()
@ -1029,4 +1046,26 @@ ZooKeeper::MultiFuture ZooKeeper::asyncMulti(const zkutil::Ops & ops)
return asyncMultiImpl(ops, true);
}
size_t getFailedOpIndex(const OpResultsPtr & op_results, int32_t transaction_return_code)
{
if (op_results == nullptr || op_results->empty())
throw DB::Exception("OpResults is empty", DB::ErrorCodes::LOGICAL_ERROR);
for (size_t index = 0; index < op_results->size(); ++index)
{
if ((*op_results)[index].err != ZOK)
return index;
}
if (!isUserError(transaction_return_code))
{
throw DB::Exception("There are no failed OPs because '" + ZooKeeper::error2string(transaction_return_code) + "' is not valid response code for that",
DB::ErrorCodes::LOGICAL_ERROR);
}
throw DB::Exception("There is no failed OpResult", DB::ErrorCodes::LOGICAL_ERROR);
}
}

View File

@ -36,7 +36,7 @@ const UInt32 BIG_SESSION_TIMEOUT = 600000;
constexpr size_t MULTI_BATCH_SIZE = 100;
struct WatchContext;
struct MultiTransactionInfo;
/// ZooKeeper session. The interface is substantially different from the usual libzookeeper API.
///
@ -200,6 +200,8 @@ public:
/// Throws only if some operation has returned an "unexpected" error
/// - an error that would cause the corresponding try- method to throw.
int32_t tryMulti(const Ops & ops, OpResultsPtr * out_results = nullptr);
/// Like previous one, but does not throw any ZooKeeper exceptions
int32_t tryMultiUnsafe(const Ops & ops, MultiTransactionInfo & info);
/// Use only with read-only operations.
int32_t tryMultiWithRetries(const Ops & ops, OpResultsPtr * out_results = nullptr, size_t * attempt = nullptr);
@ -429,6 +431,7 @@ private:
bool is_dirty = false;
};
using ZooKeeperPtr = ZooKeeper::Ptr;
@ -490,4 +493,41 @@ private:
using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr;
/// Simple structure to handle transaction execution results
struct MultiTransactionInfo
{
MultiTransactionInfo() = default;
const Ops * ops = nullptr;
int32_t code = ZOK;
OpResultsPtr op_results;
bool empty() const
{
return ops == nullptr;
}
bool hasFailedOp() const
{
return zkutil::isUserError(code);
}
const Op & getFailedOp() const
{
return *ops->at(getFailedOpIndex(op_results, code));
}
KeeperException getException() const
{
if (hasFailedOp())
{
size_t i = getFailedOpIndex(op_results, code);
return KeeperException("Transaction failed at op #" + std::to_string(i) + ": " + ops->at(i)->describe(), code);
}
else
return KeeperException(code);
}
};
}

View File

@ -15,6 +15,3 @@ target_link_libraries(zkutil_zookeeper_holder clickhouse_common_zookeeper)
add_executable (zk_many_watches_reconnect zk_many_watches_reconnect.cpp)
target_link_libraries (zk_many_watches_reconnect clickhouse_common_zookeeper clickhouse_common_configprocessor)
add_executable (zkutil_test_multi_exception zkutil_test_multi_exception.cpp)
target_link_libraries (zkutil_test_multi_exception clickhouse_common_zookeeper gtest_main)

View File

@ -1,5 +1,5 @@
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <iostream>
#include <chrono>
@ -113,3 +113,32 @@ TEST(zkutil, multi_async)
ASSERT_EQ(res.ops_ptr->size(), 2);
}
}
TEST(zkutil, multi_create_sequential)
{
try
{
auto zookeeper = std::make_unique<zkutil::ZooKeeper>("localhost:2181");
auto acl = zookeeper->getDefaultACL();
zkutil::Ops ops;
String base_path = "/clickhouse_test/zkutil/multi_create_sequential";
zookeeper->tryRemoveRecursive(base_path);
zookeeper->createAncestors(base_path + "/");
String entry_path = base_path + "/queue-";
ops.emplace_back(new zkutil::Op::Create(entry_path, "", acl, zkutil::CreateMode::EphemeralSequential));
zkutil::OpResultsPtr results = zookeeper->multi(ops);
zkutil::OpResult & result = results->at(0);
EXPECT_TRUE(result.value != nullptr);
EXPECT_TRUE(startsWith(result.value, entry_path));
}
catch (...)
{
std::cerr << getCurrentExceptionMessage(false);
throw;
}
}

View File

@ -8,3 +8,8 @@
sudo iptables -A OUTPUT -p tcp --dport 2181 -j DROP
sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j DROP
# You could also test random drops:
#sudo iptables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1
#sudo ip6tables -A OUTPUT -p tcp --dport 2181 -j REJECT --reject-with tcp-reset -m statistic --mode random --probability 0.1

View File

@ -1,6 +1,7 @@
#include <iostream>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Poco/ConsoleChannel.h>
#include <Common/Exception.h>
/// Проверяет, какие ошибки выдает ZooKeeper при попытке сделать какую-нибудь операцию через разное время после истечения сессии.
@ -36,17 +37,20 @@ int main(int argc, char ** argv)
ops.emplace_back(std::make_unique<zkutil::Op::Create>("/test/zk_expiration_test", "hello", zk.getDefaultACL(), zkutil::CreateMode::Persistent));
ops.emplace_back(std::make_unique<zkutil::Op::Remove>("/test/zk_expiration_test", -1));
int code;
zkutil::MultiTransactionInfo info;
zk.tryMultiUnsafe(ops, info);
std::cout << time(nullptr) - time0 << "s: " << zkutil::ZooKeeper::error2string(info.code) << std::endl;
try
{
code = zk.tryMulti(ops);
if (info.code != ZOK)
std::cout << "Path: " << info.getFailedOp().getPath() << std::endl;
}
catch (zkutil::KeeperException & e)
catch (...)
{
code = e.code;
std::cout << DB::getCurrentExceptionMessage(false) << std::endl;
}
std::cout << time(nullptr) - time0 << "s: " << zkutil::ZooKeeper::error2string(code) << std::endl;
}
sleep(1);
@ -54,12 +58,12 @@ int main(int argc, char ** argv)
}
catch (zkutil::KeeperException & e)
{
std::cerr << "KeeperException: " << e.displayText() << std::endl;
std::cerr << "KeeperException: " << DB::getCurrentExceptionMessage(true) << std::endl;
return 1;
}
catch (...)
{
std::cerr << "Some exception" << std::endl;
std::cerr << "Some exception: " << DB::getCurrentExceptionMessage(true) << std::endl;
return 2;
}

View File

@ -2,6 +2,7 @@
#include <string>
/** Get the FQDN for the local server by resolving DNS hostname - similar to calling the 'hostname' tool with the -f flag.
* If it does not work, return hostname - similar to calling 'hostname' without flags or 'uname -n'.
*/

View File

@ -10,25 +10,35 @@
namespace DB
{
bool isLocalAddress(const Poco::Net::SocketAddress & address, UInt16 clickhouse_port)
bool isLocalAddress(const Poco::Net::SocketAddress & address)
{
static auto interfaces = Poco::Net::NetworkInterface::list();
if (clickhouse_port == address.port())
{
return interfaces.end() != std::find_if(interfaces.begin(), interfaces.end(),
[&] (const Poco::Net::NetworkInterface & interface)
{
/** Compare the addresses without taking into account `scope`.
* Theoretically, this may not be correct - depends on `route` setting
* - through which interface we will actually access the specified address.
*/
return interface.address().length() == address.host().length()
&& 0 == memcmp(interface.address().addr(), address.host().addr(), address.host().length());
});
}
return interfaces.end() != std::find_if(interfaces.begin(), interfaces.end(),
[&] (const Poco::Net::NetworkInterface & interface)
{
/** Compare the addresses without taking into account `scope`.
* Theoretically, this may not be correct - depends on `route` setting
* - through which interface we will actually access the specified address.
*/
return interface.address().length() == address.host().length()
&& 0 == memcmp(interface.address().addr(), address.host().addr(), address.host().length());
});
}
return false;
bool isLocalAddress(const Poco::Net::SocketAddress & address, UInt16 clickhouse_port)
{
return clickhouse_port == address.port() && isLocalAddress(address);
}
size_t getHostNameDifference(const std::string & local_hostname, const std::string & host)
{
size_t hostname_difference = 0;
for (size_t i = 0; i < std::min(local_hostname.length(), host.length()); ++i)
if (local_hostname[i] != host[i])
++hostname_difference;
return hostname_difference;
}
}

View File

@ -23,4 +23,8 @@ namespace DB
*/
bool isLocalAddress(const Poco::Net::SocketAddress & address, UInt16 clickhouse_port);
bool isLocalAddress(const Poco::Net::SocketAddress & address);
/// Returns number of different bytes in hostnames, used for load balancing
size_t getHostNameDifference(const std::string & local_hostname, const std::string & host);
}

View File

@ -99,8 +99,8 @@ Block GraphiteRollupSortedBlockInputStream::readImpl()
if (i != time_column_num && i != value_column_num && i != version_column_num)
unmodified_column_numbers.push_back(i);
if (current_selected_row.empty())
current_selected_row.columns.resize(num_columns);
if (current_subgroup_newest_row.empty())
current_subgroup_newest_row.columns.resize(num_columns);
}
merge(merged_columns, queue);
@ -125,44 +125,44 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
SortCursor next_cursor = queue.top();
StringRef next_path = next_cursor->all_columns[path_column_num]->getDataAt(next_cursor->pos);
bool path_differs = is_first || next_path != StringRef(current_path);
bool new_path = is_first || next_path != current_group_path;
is_first = false;
time_t next_time = next_cursor->all_columns[time_column_num]->get64(next_cursor->pos);
time_t next_row_time = next_cursor->all_columns[time_column_num]->get64(next_cursor->pos);
/// Is new key before rounding.
bool is_new_key = path_differs || next_time != current_time;
bool is_new_key = new_path || next_row_time != current_time;
if (is_new_key)
{
/// Accumulate the row that has maximum version in the previous group of rows with the same key:
if (started_rows)
accumulateRow(current_selected_row);
accumulateRow(current_subgroup_newest_row);
const Graphite::Pattern * next_pattern = current_pattern;
if (path_differs)
if (new_path)
next_pattern = selectPatternForPath(next_path);
time_t next_time_rounded;
if (next_pattern)
{
UInt32 precision = selectPrecision(next_pattern->retentions, next_time);
next_time_rounded = roundTimeToPrecision(date_lut, next_time, precision);
UInt32 precision = selectPrecision(next_pattern->retentions, next_row_time);
next_time_rounded = roundTimeToPrecision(date_lut, next_row_time, precision);
}
else
{
/// If no pattern has matched - take the value as-is.
next_time_rounded = next_time;
next_time_rounded = next_row_time;
}
/// Key will be new after rounding. It means new result row.
bool will_be_new_key = path_differs || next_time_rounded != current_time_rounded;
bool will_be_new_key = new_path || next_time_rounded != current_time_rounded;
if (will_be_new_key)
{
if (started_rows)
{
finishCurrentRow(merged_columns);
finishCurrentGroup(merged_columns);
/// We have enough rows - return, but don't advance the loop. At the beginning of the
/// next call to merge() the same next_cursor will be processed once more and
@ -174,24 +174,27 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
/// At this point previous row has been fully processed, so we can advance the loop
/// (substitute current_* values for next_*, advance the cursor).
startNextRow(merged_columns, next_cursor, next_pattern);
startNextGroup(merged_columns, next_cursor, next_pattern);
++started_rows;
current_time_rounded = next_time_rounded;
}
/// We must make copy of next_path to avoid dangling pointers after fetchNextBlock
current_path = next_path.toString();
current_time = next_time;
current_time = next_row_time;
}
/// Within all rows with same key, we should leave only one row with maximum version;
/// and for rows with same maximum version - only last row.
UInt64 next_version = next_cursor->all_columns[version_column_num]->get64(next_cursor->pos);
if (is_new_key || next_version >= current_max_version)
if (is_new_key || next_version >= current_subgroup_max_version)
{
current_max_version = next_version;
setRowRef(current_selected_row, next_cursor);
current_subgroup_max_version = next_version;
setRowRef(current_subgroup_newest_row, next_cursor);
/// Small hack: group and subgroups have the same path, so we can set current_group_path here instead of startNextGroup
/// But since we keep in memory current_subgroup_newest_row's block, we could use StringRef for current_group_path and don't
/// make deep copy of the path.
current_group_path = next_path;
}
queue.pop();
@ -211,17 +214,19 @@ void GraphiteRollupSortedBlockInputStream::merge(MutableColumns & merged_columns
/// Write result row for the last group.
if (started_rows)
{
accumulateRow(current_selected_row);
finishCurrentRow(merged_columns);
accumulateRow(current_subgroup_newest_row);
finishCurrentGroup(merged_columns);
}
finished = true;
}
void GraphiteRollupSortedBlockInputStream::startNextRow(MutableColumns & merged_columns, SortCursor & cursor, const Graphite::Pattern * next_pattern)
template <typename TSortCursor>
void GraphiteRollupSortedBlockInputStream::startNextGroup(MutableColumns & merged_columns, TSortCursor & cursor,
const Graphite::Pattern * next_pattern)
{
/// Copy unmodified column values.
/// Copy unmodified column values (including path column).
for (size_t i = 0, size = unmodified_column_numbers.size(); i < size; ++i)
{
size_t j = unmodified_column_numbers[i];
@ -238,11 +243,11 @@ void GraphiteRollupSortedBlockInputStream::startNextRow(MutableColumns & merged_
}
void GraphiteRollupSortedBlockInputStream::finishCurrentRow(MutableColumns & merged_columns)
void GraphiteRollupSortedBlockInputStream::finishCurrentGroup(MutableColumns & merged_columns)
{
/// Insert calculated values of the columns `time`, `value`, `version`.
merged_columns[time_column_num]->insert(UInt64(current_time_rounded));
merged_columns[version_column_num]->insert(current_max_version);
merged_columns[version_column_num]->insert(current_subgroup_max_version);
if (aggregate_state_created)
{
@ -252,7 +257,7 @@ void GraphiteRollupSortedBlockInputStream::finishCurrentRow(MutableColumns & mer
}
else
merged_columns[value_column_num]->insertFrom(
*current_selected_row.columns[value_column_num], current_selected_row.row_num);
*current_subgroup_newest_row.columns[value_column_num], current_subgroup_newest_row.row_num);
}

View File

@ -175,14 +175,35 @@ private:
ColumnNumbers unmodified_column_numbers;
time_t time_of_merge;
/// No data has been read.
bool is_first = true;
/// All data has been read.
bool finished = false;
RowRef current_selected_row; /// Last row with maximum version for current primary key.
UInt64 current_max_version = 0;
/* | path | time | rounded_time | version | value | unmodified |
* -----------------------------------------------------------------------------------
* | A | 11 | 10 | 1 | 1 | a | |
* | A | 11 | 10 | 3 | 2 | b |> subgroup(A, 11) |
* | A | 11 | 10 | 2 | 3 | c | |> group(A, 10)
* ----------------------------------------------------------------------------------|>
* | A | 12 | 10 | 0 | 4 | d | |> Outputs (A, 10, avg(2, 5), a)
* | A | 12 | 10 | 1 | 5 | e |> subgroup(A, 12) |
* -----------------------------------------------------------------------------------
* | A | 21 | 20 | 1 | 6 | f |
* | B | 11 | 10 | 1 | 7 | g |
* ...
*/
bool is_first = true;
String current_path;
/// Path name of current bucket
StringRef current_group_path;
/// Last row with maximum version for current primary key (time bucket).
RowRef current_subgroup_newest_row;
UInt64 current_subgroup_max_version = 0;
/// Time of last read row
time_t current_time = 0;
time_t current_time_rounded = 0;
@ -197,10 +218,11 @@ private:
void merge(MutableColumns & merged_columns, std::priority_queue<SortCursor> & queue);
/// Insert the values into the resulting columns, which will not be changed in the future.
void startNextRow(MutableColumns & merged_columns, SortCursor & cursor, const Graphite::Pattern * next_pattern);
template <typename TSortCursor>
void startNextGroup(MutableColumns & merged_columns, TSortCursor & cursor, const Graphite::Pattern * next_pattern);
/// Insert the calculated `time`, `value`, `version` values into the resulting columns by the last group of rows.
void finishCurrentRow(MutableColumns & merged_columns);
void finishCurrentGroup(MutableColumns & merged_columns);
/// Update the state of the aggregate function with the new `value`.
void accumulateRow(RowRef & row);

View File

@ -63,6 +63,8 @@ void NativeBlockOutputStream::write(const Block & block)
if (client_revision > 0)
block.info.write(ostr);
block.checkNumberOfRows();
/// Dimensions
size_t columns = block.columns();
size_t rows = block.rows();

View File

@ -4,7 +4,8 @@
namespace DB
{
SquashingBlockInputStream::SquashingBlockInputStream(BlockInputStreamPtr & src, size_t min_block_size_rows, size_t min_block_size_bytes)
SquashingBlockInputStream::SquashingBlockInputStream(const BlockInputStreamPtr & src,
size_t min_block_size_rows, size_t min_block_size_bytes)
: transform(min_block_size_rows, min_block_size_bytes)
{
children.emplace_back(src);

View File

@ -12,7 +12,7 @@ namespace DB
class SquashingBlockInputStream : public IProfilingBlockInputStream
{
public:
SquashingBlockInputStream(BlockInputStreamPtr & src, size_t min_block_size_rows, size_t min_block_size_bytes);
SquashingBlockInputStream(const BlockInputStreamPtr & src, size_t min_block_size_rows, size_t min_block_size_bytes);
String getName() const override { return "Squashing"; }

View File

@ -1,3 +1,5 @@
#pragma once
#include <Core/Block.h>

View File

@ -16,20 +16,21 @@ bool isAtomicSet(std::atomic<bool> * val)
}
void copyData(IBlockInputStream & from, IBlockOutputStream & to, std::atomic<bool> * is_cancelled)
template <typename Pred>
void copyDataImpl(IBlockInputStream & from, IBlockOutputStream & to, Pred && is_cancelled)
{
from.readPrefix();
to.writePrefix();
while (Block block = from.read())
{
if (isAtomicSet(is_cancelled))
if (is_cancelled())
break;
to.write(block);
}
if (isAtomicSet(is_cancelled))
if (is_cancelled())
return;
/// For outputting additional information in some formats.
@ -42,11 +43,28 @@ void copyData(IBlockInputStream & from, IBlockOutputStream & to, std::atomic<boo
to.setExtremes(input->getExtremes());
}
if (isAtomicSet(is_cancelled))
if (is_cancelled())
return;
from.readSuffix();
to.writeSuffix();
}
void copyData(IBlockInputStream & from, IBlockOutputStream & to, std::atomic<bool> * is_cancelled)
{
auto is_cancelled_pred = [is_cancelled] ()
{
return isAtomicSet(is_cancelled);
};
copyDataImpl(from, to, is_cancelled_pred);
}
void copyData(IBlockInputStream & from, IBlockOutputStream & to, const std::function<bool()> & is_cancelled)
{
copyDataImpl(from, to, is_cancelled);
}
}

Some files were not shown because too many files have changed in this diff Show More