ClickHouse/dbms/src/Common/OptimizedRegularExpression.cpp

#include <Common/Exception.h>
#include <Common/OptimizedRegularExpression.h>

/// Minimum length, in bytes, a literal substring must have for analyze() to report it as required_substring
///  of a non-trivial regexp.
#define MIN_LENGTH_FOR_STRSTR 3
/// Maximum number of capturing groups accepted by the constructor (it throws when re2 reports more).
#define MAX_SUBPATTERNS 5


namespace DB
{
    namespace ErrorCodes
    {
        /// Code attached to every exception thrown in this file; only declared here (extern).
        extern const int CANNOT_COMPILE_REGEXP;
    }
}


/** Scans `regexp` once, byte by byte, and fills the three output parameters:
  *  - is_trivial: starts as true and is set to false as soon as an unescaped metacharacter
  *     or an unsupported escape sequence is met;
  *  - required_substring: a literal piece of the pattern collected outside of parentheses,
  *     square brackets and curly braces; for a non-trivial pattern it is only reported when there is
  *     no '|' at depth 0 and the piece is at least MIN_LENGTH_FOR_STRSTR bytes long; otherwise left empty;
  *  - required_substring_is_prefix: true when the reported piece was recorded at offset 0 of `regexp`.
  * All three outputs are reset at the beginning, so their previous values do not matter.
  */
template <bool thread_safe>
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
    const std::string & regexp,
    std::string & required_substring,
    bool & is_trivial,
    bool & required_substring_is_prefix)
{
    /** The expression is trivial if all the metacharacters in it are escaped.
      * The non-alternative string is
      *  a string outside parentheses,
      *  in which all metacharacters are escaped,
      *  and also if there are no '|' outside the brackets,
      *  and also avoid substrings of the form `http://` or `www` and some other
      *   (this is the hack for typical use case in Yandex.Metrica).
      */
    const char * begin = regexp.data();
    const char * pos = begin;
    const char * end = regexp.data() + regexp.size();
    /// Nesting level; incremented by '(' and '[' and decremented by ')' and ']'.
    int depth = 0;
    is_trivial = true;
    required_substring_is_prefix = false;
    required_substring.clear();
    bool has_alternative_on_depth_0 = false;

    /// Substring with a position.
    using Substring = std::pair<std::string, size_t>;
    using Substrings = std::vector<Substring>;

    /// The list always ends with the piece currently being accumulated (possibly still empty).
    /// last_substring is re-pointed after every resize because resizing may reallocate the vector.
    Substrings trivial_substrings(1);
    Substring * last_substring = &trivial_substrings.back();

    bool in_curly_braces = false;
    bool in_square_braces = false;

    while (pos != end)
    {
        switch (*pos)
        {
            /// A zero byte inside the pattern stops the scan.
            case '\0':
                pos = end;
                break;

            case '\\':
            {
                ++pos;
                /// Backslash is the last byte of the pattern: nothing follows it.
                if (pos == end)
                    break;

                switch (*pos)
                {
                    /// Escaped metacharacter: it is a literal byte, append it to the current piece.
                    case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
                        if (depth == 0 && !in_curly_braces && !in_square_braces)
                        {
                            if (last_substring->first.empty())
                                last_substring->second = pos - begin;
                            last_substring->first.push_back(*pos);
                        }
                        break;
                    default:
                        /// all other escape sequences are not supported
                        is_trivial = false;
                        if (!last_substring->first.empty())
                        {
                            trivial_substrings.resize(trivial_substrings.size() + 1);
                            last_substring = &trivial_substrings.back();
                        }
                        break;
                }

                ++pos;
                break;
            }

            /// Alternation: finishes the current piece; at depth 0 it also disables required_substring
            ///  for non-trivial patterns (see the check after the loop).
            case '|':
                if (depth == 0)
                    has_alternative_on_depth_0 = true;
                is_trivial = false;
                if (!in_square_braces && !last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// Inside square brackets '(' is skipped without changing any state.
            case '(':
                if (!in_square_braces)
                {
                    ++depth;
                    is_trivial = false;
                    if (!last_substring->first.empty())
                    {
                        trivial_substrings.resize(trivial_substrings.size() + 1);
                        last_substring = &trivial_substrings.back();
                    }
                }
                ++pos;
                break;

            case '[':
                in_square_braces = true;
                ++depth;
                is_trivial = false;
                if (!last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// A ']' with no open '[' is handled as an ordinary character.
            case ']':
                if (!in_square_braces)
                    goto ordinary;

                in_square_braces = false;
                --depth;
                is_trivial = false;
                if (!last_substring->first.empty())
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// Inside square brackets ')' is skipped without changing any state.
            case ')':
                if (!in_square_braces)
                {
                    --depth;
                    is_trivial = false;
                    if (!last_substring->first.empty())
                    {
                        trivial_substrings.resize(trivial_substrings.size() + 1);
                        last_substring = &trivial_substrings.back();
                    }
                }
                ++pos;
                break;

            /// These metacharacters finish the current piece but keep everything collected so far.
            case '^': case '$': case '.': case '+':
                is_trivial = false;
                if (!last_substring->first.empty() && !in_square_braces)
                {
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// Quantifiers that allow a zero number of occurrences.
            case '{':
                in_curly_braces = true;
                [[fallthrough]];
            case '?':
                [[fallthrough]];
            case '*':
                is_trivial = false;
                if (!last_substring->first.empty() && !in_square_braces)
                {
                    /// The quantified element may be absent, so the last collected byte is removed from the piece.
                    /// NOTE(review): exactly one byte is dropped; for a multi-byte encoded character this
                    ///  would leave its leading bytes in the piece - confirm whether that is intended.
                    last_substring->first.resize(last_substring->first.size() - 1);
                    trivial_substrings.resize(trivial_substrings.size() + 1);
                    last_substring = &trivial_substrings.back();
                }
                ++pos;
                break;

            /// A '}' with no open '{' is handled as an ordinary character.
            case '}':
                if (!in_curly_braces)
                    goto ordinary;

                in_curly_braces = false;
                ++pos;
                break;

            ordinary:   /// Normal, not escaped symbol.
            [[fallthrough]];
            default:
                /// Collected only at depth 0 and outside of {...} and [...]; the offset of the first byte
                ///  of each piece is remembered in `second`.
                if (depth == 0 && !in_curly_braces && !in_square_braces)
                {
                    if (last_substring->first.empty())
                        last_substring->second = pos - begin;
                    last_substring->first.push_back(*pos);
                }
                ++pos;
                break;
        }
    }

    /// Remove the trailing piece if nothing was accumulated into it.
    if (last_substring && last_substring->first.empty())
        trivial_substrings.pop_back();

    if (!is_trivial)
    {
        if (!has_alternative_on_depth_0)
        {
            /** We choose the non-alternative substring of the maximum length, among the prefixes,
              *  or a non-alternative substring of maximum length.
              */

            /// Tuning for typical usage domain
            auto tuning_strings_condition = [](const std::string & str)
            {
                return str != "://" && str != "http://" && str != "www" && str != "Windows ";
            };
            size_t max_length = 0;
            Substrings::const_iterator candidate_it = trivial_substrings.begin();
            /// First pass: a piece recorded at offset 0 replaces a candidate that is not at offset 0;
            ///  between pieces of the same kind the longer one wins.
            for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
            {
                if (((it->second == 0 && candidate_it->second != 0)
                        || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
                    && tuning_strings_condition(it->first))
                {
                    max_length = it->first.size();
                    candidate_it = it;
                }
            }

            /// If prefix is small, it won't be chosen
            /// Second pass: position is ignored, any longer piece replaces the candidate.
            if (max_length < MIN_LENGTH_FOR_STRSTR)
            {
                for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
                {
                    if (it->first.size() > max_length && tuning_strings_condition(it->first))
                    {
                        max_length = it->first.size();
                        candidate_it = it;
                    }
                }
            }

            /// A candidate shorter than MIN_LENGTH_FOR_STRSTR is not reported; required_substring stays empty.
            if (max_length >= MIN_LENGTH_FOR_STRSTR)
            {
                required_substring = candidate_it->first;
                required_substring_is_prefix = candidate_it->second == 0;
            }
        }
    }
    else if (!trivial_substrings.empty())
    {
        /// Trivial pattern: the first collected piece is reported regardless of its length.
        required_substring = trivial_substrings.front().first;
        required_substring_is_prefix = trivial_substrings.front().second == 0;
    }

/*    std::cerr
        << "regexp: " << regexp
        << ", is_trivial: " << is_trivial
        << ", required_substring: " << required_substring
        << ", required_substring_is_prefix: " << required_substring_is_prefix
        << std::endl;*/
}


/** Prepares the matcher for `regexp_`.
  * The pattern is first passed to analyze(); then `options` is validated and, unless analyze()
  *  marked the pattern as trivial, the pattern is compiled with re2.
  * Throws DB::Exception with code CANNOT_COMPILE_REGEXP when
  *  - `options` has a bit other than RE_CASELESS, RE_NO_CAPTURE, RE_DOT_NL;
  *  - re2 reports the compiled pattern as not ok;
  *  - capturing is enabled and the pattern has more than MAX_SUBPATTERNS capturing groups.
  */
template <bool thread_safe>
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
    analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);

    /// Just three following options are supported
    const int supported_options = RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL;
    if (options & ~supported_options)
        throw DB::Exception("OptimizedRegularExpression: Unsupported option.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);

    is_case_insensitive = options & RE_CASELESS;
    const bool capture_disabled = options & RE_NO_CAPTURE;
    const bool dot_matches_newline = options & RE_DOT_NL;

    number_of_subpatterns = 0;

    /// For a trivial pattern re2 is not compiled at all.
    if (is_trivial)
        return;

    typename RegexType::Options compile_options;

    /// Never write error messages to stderr. It's ignorant to do it from library code.
    compile_options.set_log_errors(false);

    if (is_case_insensitive)
        compile_options.set_case_sensitive(false);

    if (dot_matches_newline)
        compile_options.set_dot_nl(true);

    re2 = std::make_unique<RegexType>(regexp_, compile_options);
    if (!re2->ok())
        throw DB::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error() + ". Look at https://github.com/google/re2/wiki/Syntax for reference.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);

    /// With RE_NO_CAPTURE the group count stays 0 and is not checked against the limit.
    if (capture_disabled)
        return;

    number_of_subpatterns = re2->NumberOfCapturingGroups();
    if (number_of_subpatterns > MAX_SUBPATTERNS)
        throw DB::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_, DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
}


/** Tells whether `subject` contains a match, without reporting where.
  *
  * subject      - text to search. The substring checks below go through strstr/strcasestr,
  *                 which receive only the pointer (subject_size is not passed to them).
  * subject_size - number of bytes handed to re2.
  *
  * Trivial pattern: the answer is whether required_substring occurs in subject.
  * Otherwise: a non-empty required_substring is looked up first, and re2 runs only if it is present.
  */
template <bool thread_safe>
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const
{
    /// Substring lookup that follows the case sensitivity chosen in the constructor.
    const auto find_required_substring = [this](const char * haystack) -> const char *
    {
        if (is_case_insensitive)
            return strcasestr(haystack, required_substring.data());
        return strstr(haystack, required_substring.data());
    };

    if (is_trivial)
        return find_required_substring(subject) != nullptr;

    /// Pre-filter: when the literal is absent the regexp is not run.
    if (!required_substring.empty() && find_required_substring(subject) == nullptr)
        return false;

    return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
}


/** Looks for a match in `subject` and reports its position.
  *
  * subject      - text to search. The substring checks go through strstr/strcasestr,
  *                 which receive only the pointer (subject_size is not passed to them).
  * subject_size - number of bytes handed to re2.
  * match        - on success receives the offset (relative to subject) and the length of the whole match;
  *                 not modified when the function returns false.
  *
  * Returns true if a match was found.
  */
template <bool thread_safe>
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const
{
    if (is_trivial)
    {
        const char * pos;
        if (is_case_insensitive)
            pos = strcasestr(subject, required_substring.data());
        else
            pos = strstr(subject, required_substring.data());

        if (pos == nullptr)
            return false;

        match.offset = pos - subject;
        match.length = required_substring.size();
        return true;
    }

    /// Pre-filter: when the required literal is absent the regexp is not run.
    if (!required_substring.empty())
    {
        const char * pos;
        if (is_case_insensitive)
            pos = strcasestr(subject, required_substring.data());
        else
            pos = strstr(subject, required_substring.data());

        if (nullptr == pos)
            return false;
    }

    /** Ask re2 for submatch 0, i.e. the whole match.
      * PartialMatch with a single output argument would instead capture the first parenthesized group
      *  and would fail for a pattern that has no capturing group at all, which disagrees with
      *  the trivial branch above and with matches[0] of the MatchVec overload.
      */
    StringPieceType piece;

    if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, &piece, 1))
        return false;

    match.offset = piece.data() - subject;
    match.length = piece.length();
    return true;
}


/** Looks for a match in `subject` and reports the whole match followed by capturing groups.
  *
  * subject      - text to search. The substring checks go through strstr/strcasestr,
  *                 which receive only the pointer (subject_size is not passed to them).
  * subject_size - number of bytes handed to re2.
  * matches      - cleared first. On success element 0 is the whole match and the following elements
  *                 are capturing groups; an element for which `pieces[i] != nullptr` is false
  *                 gets offset std::string::npos and length 0.
  * limit        - maximum number of elements wanted; clamped to number_of_subpatterns + 1.
  *
  * Returns the number of elements written to `matches` (0 if nothing matched or limit is 0;
  *  1 for a trivial pattern that matched).
  */
template <bool thread_safe>
unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
{
    matches.clear();

    if (limit == 0)
        return 0;

    if (limit > number_of_subpatterns + 1)
        limit = number_of_subpatterns + 1;

    if (is_trivial)
    {
        const char * pos;
        if (is_case_insensitive)
            pos = strcasestr(subject, required_substring.data());
        else
            pos = strstr(subject, required_substring.data());

        if (pos == nullptr)
            return 0;
        else
        {
            Match match;
            match.offset = pos - subject;
            match.length = required_substring.size();
            matches.push_back(match);
            return 1;
        }
    }
    else
    {
        /// Pre-filter: when the required literal is absent the regexp is not run.
        if (!required_substring.empty())
        {
            const char * pos;
            if (is_case_insensitive)
                pos = strcasestr(subject, required_substring.data());
            else
                pos = strstr(subject, required_substring.data());

            if (nullptr == pos)
                return 0;
        }

        /** One slot for the whole match plus one per capturing group.
          * The constructor accepts up to MAX_SUBPATTERNS groups, so `limit` can be MAX_SUBPATTERNS + 1;
          *  an array of only MAX_SUBPATTERNS elements would be overrun by one element in that case.
          */
        StringPieceType pieces[MAX_SUBPATTERNS + 1];

        if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))
            return 0;
        else
        {
            matches.resize(limit);
            for (size_t i = 0; i < limit; ++i)
            {
                if (pieces[i] != nullptr)
                {
                    matches[i].offset = pieces[i].data() - subject;
                    matches[i].length = pieces[i].length();
                }
                else
                {
                    matches[i].offset = std::string::npos;
                    matches[i].length = 0;
                }
            }
            return limit;
        }
    }
}

/// Explicit instantiation of both variants of the class template (thread_safe = true and false).
template class OptimizedRegularExpressionImpl<true>;
template class OptimizedRegularExpressionImpl<false>;
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`#include <Common/Exception.h>`
Moved headers and sources to same place [#CLICKHOUSE-3]. 2017-04-01 09:19:00 +00:00			`#include <Common/OptimizedRegularExpression.h>`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00
			`#define MIN_LENGTH_FOR_STRSTR 3`
			`#define MAX_SUBPATTERNS 5`

Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`namespace DB`
			`{`
			`namespace ErrorCodes`
			`{`
			`extern const int CANNOT_COMPILE_REGEXP;`
			`}`
			`}`


Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`template <bool thread_safe>`
			`void OptimizedRegularExpressionImpl<thread_safe>::analyze(`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`const std::string & regexp,`
			`std::string & required_substring,`
			`bool & is_trivial,`
			`bool & required_substring_is_prefix)`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`{`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`/** The expression is trivial if all the metacharacters in it are escaped.`
			`* The non-alternative string is`
			`* a string outside parentheses,`
			`* in which all metacharacters are escaped,`
			`* and also if there are no '\|' outside the brackets,`
Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			* and also avoid substrings of the form `http://` or `www` and some other
			`* (this is the hack for typical use case in Yandex.Metrica).`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`*/`
			`const char * begin = regexp.data();`
			`const char * pos = begin;`
			`const char * end = regexp.data() + regexp.size();`
			`int depth = 0;`
			`is_trivial = true;`
			`required_substring_is_prefix = false;`
			`required_substring.clear();`
			`bool has_alternative_on_depth_0 = false;`

			`/// Substring with a position.`
Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`using Substring = std::pair<std::string, size_t>;`
			`using Substrings = std::vector<Substring>;`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
			`Substrings trivial_substrings(1);`
			`Substring * last_substring = &trivial_substrings.back();`

			`bool in_curly_braces = false;`
			`bool in_square_braces = false;`

			`while (pos != end)`
			`{`
			`switch (*pos)`
			`{`
			`case '\0':`
			`pos = end;`
			`break;`

			`case '\\':`
			`{`
			`++pos;`
			`if (pos == end)`
			`break;`

			`switch (*pos)`
			`{`
			`case '\|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':`
			`if (depth == 0 && !in_curly_braces && !in_square_braces)`
			`{`
			`if (last_substring->first.empty())`
			`last_substring->second = pos - begin;`
			`last_substring->first.push_back(*pos);`
			`}`
			`break;`
			`default:`
			`/// all other escape sequences are not supported`
			`is_trivial = false;`
			`if (!last_substring->first.empty())`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`break;`
			`}`

			`++pos;`
			`break;`
			`}`

			`case '\|':`
			`if (depth == 0)`
			`has_alternative_on_depth_0 = true;`
			`is_trivial = false;`
			`if (!in_square_braces && !last_substring->first.empty())`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`++pos;`
			`break;`

			`case '(':`
			`if (!in_square_braces)`
			`{`
			`++depth;`
			`is_trivial = false;`
			`if (!last_substring->first.empty())`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`}`
			`++pos;`
			`break;`

			`case '[':`
			`in_square_braces = true;`
			`++depth;`
			`is_trivial = false;`
			`if (!last_substring->first.empty())`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`++pos;`
			`break;`

			`case ']':`
			`if (!in_square_braces)`
			`goto ordinary;`

			`in_square_braces = false;`
			`--depth;`
			`is_trivial = false;`
			`if (!last_substring->first.empty())`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`++pos;`
			`break;`

			`case ')':`
			`if (!in_square_braces)`
			`{`
			`--depth;`
			`is_trivial = false;`
			`if (!last_substring->first.empty())`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`}`
			`++pos;`
			`break;`

			`case '^': case '$': case '.': case '+':`
			`is_trivial = false;`
			`if (!last_substring->first.empty() && !in_square_braces)`
			`{`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`++pos;`
			`break;`

dbms: Fixed misspells in comments 2019-01-22 19:56:53 +00:00			`/// Quantifiers that allow a zero number of occurrences.`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`case '{':`
			`in_curly_braces = true;`
Better [#CLICKHOUSE-2]. 2017-12-02 03:22:51 +00:00			`[[fallthrough]];`
			`case '?':`
			`[[fallthrough]];`
			`case '*':`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`is_trivial = false;`
			`if (!last_substring->first.empty() && !in_square_braces)`
			`{`
			`last_substring->first.resize(last_substring->first.size() - 1);`
			`trivial_substrings.resize(trivial_substrings.size() + 1);`
			`last_substring = &trivial_substrings.back();`
			`}`
			`++pos;`
			`break;`

			`case '}':`
			`if (!in_curly_braces)`
			`goto ordinary;`

			`in_curly_braces = false;`
			`++pos;`
			`break;`

			`ordinary: /// Normal, not escaped symbol.`
Better [#CLICKHOUSE-2]. 2017-12-02 03:22:51 +00:00			`[[fallthrough]];`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`default:`
			`if (depth == 0 && !in_curly_braces && !in_square_braces)`
			`{`
			`if (last_substring->first.empty())`
			`last_substring->second = pos - begin;`
			`last_substring->first.push_back(*pos);`
			`}`
			`++pos;`
			`break;`
			`}`
			`}`

			`if (last_substring && last_substring->first.empty())`
			`trivial_substrings.pop_back();`

			`if (!is_trivial)`
			`{`
			`if (!has_alternative_on_depth_0)`
			`{`
			`/** We choose the non-alternative substring of the maximum length, among the prefixes,`
			`* or a non-alternative substring of maximum length.`
			`*/`
Fix word extraction if prefix candidate is small 2019-05-05 09:32:26 +00:00
			`/// Tuning for typical usage domain`
			`auto tuning_strings_condition = [](const std::string & str)`
			`{`
			`return str != "://" && str != "http://" && str != "www" && str != "Windows ";`
			`};`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`size_t max_length = 0;`
			`Substrings::const_iterator candidate_it = trivial_substrings.begin();`
			`for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)`
			`{`
			`if (((it->second == 0 && candidate_it->second != 0)`
			`\|\| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))`
Fix word extraction if prefix candidate is small 2019-05-05 09:32:26 +00:00			`&& tuning_strings_condition(it->first))`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`{`
			`max_length = it->first.size();`
			`candidate_it = it;`
			`}`
			`}`

Fix word extraction if prefix candidate is small 2019-05-05 09:32:26 +00:00			`/// If prefix is small, it won't be chosen`
Better condition 2019-05-05 09:34:43 +00:00			`if (max_length < MIN_LENGTH_FOR_STRSTR)`
Fix word extraction if prefix candidate is small 2019-05-05 09:32:26 +00:00			`{`
			`for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)`
			`{`
			`if (it->first.size() > max_length && tuning_strings_condition(it->first))`
			`{`
			`max_length = it->first.size();`
			`candidate_it = it;`
			`}`
			`}`
			`}`

Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`if (max_length >= MIN_LENGTH_FOR_STRSTR)`
			`{`
			`required_substring = candidate_it->first;`
			`required_substring_is_prefix = candidate_it->second == 0;`
			`}`
			`}`
			`}`
Fix warnings (#1406) * Log query id in executeQuery; Better type mismatch error; change format in report tool * Better log query_id * fix message * Use c++11 thread_local instaed of gcc's __thread * lock mutex before notifying waiting thread in sync insertion into distributed [#CLICKHOUSE-3379] * Cmake: fix build without downloaded submodules (#1379) * fix * ZooKeeper: fixed stack smashing with tryGet() The tryGet() operation creates a 1MB buffer on stack. This may or may not work depending on the default stack size for threads, whether the stack protector is enabled or not, recursion depth, and the actual value size. This is probably going to slow down some ZK operations, but I don't see how else this could work reliably with the existing API. * increased timeout for test_insertion_sync_fails_with_timeout * Update CHANGELOG_RU.md * Update ZooKeeper.cpp * Fix warnings * Fixes * Dont strip debug info from asan, tsan and other builds except releases * Fix asan error causd by test 00144 * Fix empty log message (#CLICKHOUSE-3378) 2017-10-25 18:39:10 +00:00			`else if (!trivial_substrings.empty())`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`{`
			`required_substring = trivial_substrings.front().first;`
			`required_substring_is_prefix = trivial_substrings.front().second == 0;`
			`}`

			`/* std::cerr`
			`<< "regexp: " << regexp`
			`<< ", is_trivial: " << is_trivial`
			`<< ", required_substring: " << required_substring`
			`<< ", required_substring_is_prefix: " << required_substring_is_prefix`
			`<< std::endl;*/`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`}`


Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`template <bool thread_safe>`
			`OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`{`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);`

Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`/// Just three following options are supported`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`if (options & (~(RE_CASELESS \| RE_NO_CAPTURE \| RE_DOT_NL)))`
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`throw DB::Exception("OptimizedRegularExpression: Unsupported option.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`is_case_insensitive = options & RE_CASELESS;`
			`bool is_no_capture = options & RE_NO_CAPTURE;`
			`bool is_dot_nl = options & RE_DOT_NL;`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
			`number_of_subpatterns = 0;`
			`if (!is_trivial)`
			`{`
			`/// Compile the re2 regular expression.`
Non significant changes according to clang's -Weverything, part 1 [#CLICKHOUSE-2] 2018-06-03 16:51:31 +00:00			`typename RegexType::Options regexp_options;`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`/// Never write error messages to stderr. It's ignorant to do it from library code.`
			`regexp_options.set_log_errors(false);`

Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`if (is_case_insensitive)`
Non significant changes according to clang's -Weverything, part 1 [#CLICKHOUSE-2] 2018-06-03 16:51:31 +00:00			`regexp_options.set_case_sensitive(false);`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
			`if (is_dot_nl)`
Non significant changes according to clang's -Weverything, part 1 [#CLICKHOUSE-2] 2018-06-03 16:51:31 +00:00			`regexp_options.set_dot_nl(true);`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
Non significant changes according to clang's -Weverything, part 1 [#CLICKHOUSE-2] 2018-06-03 16:51:31 +00:00			`re2 = std::make_unique<RegexType>(regexp_, regexp_options);`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`if (!re2->ok())`
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`throw DB::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error() + ". Look at https://github.com/google/re2/wiki/Syntax for reference.", DB::ErrorCodes::CANNOT_COMPILE_REGEXP);`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00
			`if (!is_no_capture)`
			`{`
			`number_of_subpatterns = re2->NumberOfCapturingGroups();`
			`if (number_of_subpatterns > MAX_SUBPATTERNS)`
Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`throw DB::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_, DB::ErrorCodes::CANNOT_COMPILE_REGEXP);`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`}`
			`}`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`}`


Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`template <bool thread_safe>`
			`bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`{`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`if (is_trivial)`
			`{`
			`if (is_case_insensitive)`
			`return nullptr != strcasestr(subject, required_substring.data());`
			`else`
			`return nullptr != strstr(subject, required_substring.data());`
			`}`
			`else`
			`{`
			`if (!required_substring.empty())`
			`{`
			`const char * pos;`
			`if (is_case_insensitive)`
			`pos = strcasestr(subject, required_substring.data());`
			`else`
			`pos = strstr(subject, required_substring.data());`

			`if (nullptr == pos)`
			`return 0;`
			`}`

			`return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);`
			`}`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`}`


Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`template <bool thread_safe>`
			`bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`{`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`if (is_trivial)`
			`{`
			`const char * pos;`
			`if (is_case_insensitive)`
			`pos = strcasestr(subject, required_substring.data());`
			`else`
			`pos = strstr(subject, required_substring.data());`

			`if (pos == nullptr)`
			`return 0;`
			`else`
			`{`
			`match.offset = pos - subject;`
			`match.length = required_substring.size();`
			`return 1;`
			`}`
			`}`
			`else`
			`{`
			`if (!required_substring.empty())`
			`{`
			`const char * pos;`
			`if (is_case_insensitive)`
			`pos = strcasestr(subject, required_substring.data());`
			`else`
			`pos = strstr(subject, required_substring.data());`

			`if (nullptr == pos)`
			`return 0;`
			`}`

			`StringPieceType piece;`

			`if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))`
			`return 0;`
			`else`
			`{`
			`match.offset = piece.data() - subject;`
			`match.length = piece.length();`
			`return 1;`
			`}`
			`}`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`}`


Fixed translation errors; miscellaneous changes [#CLICKHOUSE-3]. 2017-05-10 04:00:19 +00:00			`template <bool thread_safe>`
			`unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`{`
Addition to prev. revision [#CLICKHOUSE-2]. 2017-05-10 02:45:21 +00:00			`matches.clear();`

			`if (limit == 0)`
			`return 0;`

			`if (limit > number_of_subpatterns + 1)`
			`limit = number_of_subpatterns + 1;`

			`if (is_trivial)`
			`{`
			`const char * pos;`
			`if (is_case_insensitive)`
			`pos = strcasestr(subject, required_substring.data());`
			`else`
			`pos = strstr(subject, required_substring.data());`

			`if (pos == nullptr)`
			`return 0;`
			`else`
			`{`
			`Match match;`
			`match.offset = pos - subject;`
			`match.length = required_substring.size();`
			`matches.push_back(match);`
			`return 1;`
			`}`
			`}`
			`else`
			`{`
			`if (!required_substring.empty())`
			`{`
			`const char * pos;`
			`if (is_case_insensitive)`
			`pos = strcasestr(subject, required_substring.data());`
			`else`
			`pos = strstr(subject, required_substring.data());`

			`if (nullptr == pos)`
			`return 0;`
			`}`

			`StringPieceType pieces[MAX_SUBPATTERNS];`

			`if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))`
			`return 0;`
			`else`
			`{`
			`matches.resize(limit);`
			`for (size_t i = 0; i < limit; ++i)`
			`{`
			`if (pieces[i] != nullptr)`
			`{`
			`matches[i].offset = pieces[i].data() - subject;`
			`matches[i].length = pieces[i].length();`
			`}`
			`else`
			`{`
			`matches[i].offset = std::string::npos;`
			`matches[i].length = 0;`
			`}`
			`}`
			`return limit;`
			`}`
			`}`
Moved files [#METR-17973]. 2015-10-05 01:11:12 +00:00			`}`

Don't log to stderr within RE2 library [#CLICKHOUSE-2] 2018-11-30 19:37:31 +00:00			`template class OptimizedRegularExpressionImpl<true>;`
			`template class OptimizedRegularExpressionImpl<false>;`