ClickHouse/src/Common/OptimizedRegularExpression.cpp
Alexey Milovidov 0d14a2c67e Useless changes
2024-05-09 03:11:02 +02:00

724 lines
24 KiB
C++

#include <limits>
#include <Common/Exception.h>
#include <Common/logger_useful.h>
#include <Common/PODArray.h>
#include <Common/checkStackSize.h>
#include <Common/OptimizedRegularExpression.h>
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 1024
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_COMPILE_REGEXP;
}
}
namespace
{
struct Literal
{
std::string literal;
bool prefix; /// this literal string is the prefix of the whole string.
bool suffix; /// this literal string is the suffix of the whole string.
void clear()
{
literal.clear();
prefix = false;
suffix = false;
}
};
using Literals = std::vector<Literal>;
size_t shortest_literal_length(const Literals & literals)
{
if (literals.empty()) return 0;
size_t shortest = std::numeric_limits<size_t>::max();
for (const auto & lit : literals)
shortest = std::min(shortest, lit.literal.size());
return shortest;
}
const char * skipNameCapturingGroup(const char * pos, size_t offset, const char * end)
{
const char special = *(pos + offset) == '<' ? '>' : '\'';
offset ++;
while (pos + offset < end)
{
const char cur = *(pos + offset);
if (cur == special)
{
return pos + offset;
}
if (('0' <= cur && cur <= '9') || ('a' <= cur && cur <= 'z') || ('A' <= cur && cur <= 'Z'))
offset ++;
else
return pos;
}
return pos;
}
const char * analyzeImpl(
std::string_view regexp,
const char * pos,
Literal & required_substring,
bool & is_trivial,
Literals & global_alternatives)
{
checkStackSize();
/** The expression is trivial if all the metacharacters in it are escaped.
* The non-alternative string is
* a string outside parentheses,
* in which all metacharacters are escaped,
* and also if there are no '|' outside the brackets,
* and also avoid substrings of the form `http://` or `www` and some other
* (this is the hack for typical use case in web analytics applications).
*/
const char * begin = pos;
const char * end = regexp.data() + regexp.size();
bool is_first_call = begin == regexp.data();
int depth = 0;
is_trivial = true;
bool is_prefix = true;
required_substring.clear();
bool has_alternative_on_depth_0 = false;
bool has_case_insensitive_flag = false;
/// Substring with is_prefix.
using Substring = std::pair<std::string, bool>;
using Substrings = std::vector<Substring>;
Substrings trivial_substrings(1);
Substring * last_substring = &trivial_substrings.back();
Literals cur_alternatives;
auto finish_cur_alternatives = [&]()
{
if (cur_alternatives.empty())
return;
if (global_alternatives.empty())
{
global_alternatives = cur_alternatives;
cur_alternatives.clear();
return;
}
/// that means current alternatives have better quality.
if (shortest_literal_length(global_alternatives) < shortest_literal_length(cur_alternatives))
{
global_alternatives.clear();
global_alternatives = cur_alternatives;
}
cur_alternatives.clear();
};
auto finish_non_trivial_char = [&](bool create_new_substr = true)
{
is_trivial = false;
if (create_new_substr)
is_prefix = false;
if (depth != 0)
return;
for (auto & alter : cur_alternatives)
{
if (alter.suffix)
{
alter.literal += last_substring->first;
alter.suffix = false;
}
}
finish_cur_alternatives();
if (!last_substring->first.empty() && create_new_substr)
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
};
/// Resolve the string or alters in a group (xxxxx)
auto finish_group = [&](Literal & group_required_string, Literals & group_alternatives)
{
for (auto & alter : group_alternatives)
{
if (alter.prefix)
{
alter.literal = last_substring->first + alter.literal;
alter.prefix = is_prefix;
}
}
if (group_required_string.prefix)
{
last_substring->first += group_required_string.literal;
last_substring->second = is_prefix;
}
else
{
finish_non_trivial_char();
last_substring->first = group_required_string.literal;
last_substring->second = false;
}
is_prefix = is_prefix && group_required_string.prefix && group_required_string.suffix;
/// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg
if (!last_substring->first.empty() && !group_required_string.suffix)
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
/// assign group alters to current alters.
finish_cur_alternatives();
cur_alternatives = std::move(group_alternatives);
};
bool in_curly_braces = false;
bool in_square_braces = false;
while (pos != end)
{
switch (*pos)
{
case '\0':
pos = end;
break;
case '\\':
{
++pos;
if (pos == end)
break;
switch (*pos)
{
case '|':
case '(':
case ')':
case '^':
case '$':
case '.':
case '[':
case ']':
case '?':
case '*':
case '+':
case '-':
case '{':
case '}':
case '/':
goto ordinary;
default:
/// all other escape sequences are not supported
finish_non_trivial_char();
break;
}
++pos;
break;
}
case '|':
is_trivial = false;
is_prefix = false;
++pos;
if (depth == 0)
{
has_alternative_on_depth_0 = true;
goto finish;
}
break;
case '(':
/// bracket does not break is_prefix. for example abc(d) has a prefix 'abcd'
is_trivial = false;
if (!in_square_braces)
{
/// Check for case-insensitive flag.
if (pos + 1 < end && pos[1] == '?')
{
for (size_t offset = 2; pos + offset < end; ++offset)
{
if (pos[offset] == '-' /// it means flag negation
/// various possible flags, actually only imsU are supported by re2
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
{
if (pos[offset] == 'i')
{
/// Actually it can be negated case-insensitive flag. But we don't care.
has_case_insensitive_flag = true;
break;
}
}
else
break;
}
}
/// (?:regex) means non-capturing parentheses group
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
{
pos += 2;
}
if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
{
pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end);
}
Literal group_required_substr;
bool group_is_trival = true;
Literals group_alters;
pos = analyzeImpl(regexp, pos + 1, group_required_substr, group_is_trival, group_alters);
/// pos should be ')', if not, then it is not a valid regular expression
if (pos == end)
return pos;
/// For ()? or ()* or (){0,1}, we can just ignore the whole group.
if ((pos + 1 < end && (pos[1] == '?' || pos[1] == '*')) ||
(pos + 2 < end && pos[1] == '{' && pos[2] == '0'))
{
finish_non_trivial_char();
}
else
{
finish_group(group_required_substr, group_alters);
}
}
++pos;
break;
case '[':
in_square_braces = true;
++depth;
finish_non_trivial_char();
++pos;
break;
case ']':
if (!in_square_braces)
goto ordinary;
--depth;
if (depth == 0)
in_square_braces = false;
finish_non_trivial_char();
++pos;
break;
case ')':
if (!in_square_braces)
{
goto finish;
}
++pos;
break;
case '^': case '$': case '.': case '+':
finish_non_trivial_char();
++pos;
break;
/// Quantifiers that allow a zero number of occurrences.
case '{':
in_curly_braces = true;
[[fallthrough]];
case '?':
[[fallthrough]];
case '*':
if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
{
last_substring->first.resize(last_substring->first.size() - 1);
}
finish_non_trivial_char();
++pos;
break;
case '}':
if (!in_curly_braces)
goto ordinary;
in_curly_braces = false;
++pos;
break;
ordinary: /// Normal, not escaped symbol.
[[fallthrough]];
default:
if (depth == 0 && !in_curly_braces && !in_square_braces)
{
/// record the first position of last string.
if (last_substring->first.empty())
last_substring->second = is_prefix;
last_substring->first.push_back(*pos);
}
++pos;
break;
}
}
finish:
if (!is_trivial)
{
finish_non_trivial_char(false);
/// we calculate required substring even though has_alternative_on_depth_0.
/// we will clear the required substring after putting it to alternatives.
if (!has_case_insensitive_flag)
{
/// We choose the non-alternative substring of the maximum length for first search.
/// Tuning for typical usage domain
auto tuning_strings_condition = [](const std::string & str)
{
return str != "://" && str != "http://" && str != "www" && str != "Windows ";
};
size_t max_length = 0;
Substrings::const_iterator candidate_it = trivial_substrings.begin();
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
{
if (it->first.size() > max_length && tuning_strings_condition(it->first))
{
max_length = it->first.size();
candidate_it = it;
}
}
if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
{
required_substring.literal = candidate_it->first;
required_substring.prefix = candidate_it->second;
required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
}
}
}
else if (!trivial_substrings.empty())
{
required_substring.literal = trivial_substrings.front().first;
/// trivial string means the whole regex is a simple string literal, so the prefix and suffix should be true.
required_substring.prefix = true;
required_substring.suffix = true;
}
/// if it is xxx|xxx|xxx, we should call the next xxx|xxx recursively and collect the result.
if (has_alternative_on_depth_0)
{
/// compare the quality of required substring and alternatives and choose the better one.
if (shortest_literal_length(global_alternatives) < required_substring.literal.size())
global_alternatives = {required_substring};
Literals next_alternatives;
/// this two vals are useless, xxx|xxx cannot be trivial nor prefix.
bool next_is_trivial = true;
pos = analyzeImpl(regexp, pos, required_substring, next_is_trivial, next_alternatives);
/// For xxx|xxx|xxx, we only combine the alternatives and return a empty required_substring.
if (next_alternatives.empty() || shortest_literal_length(next_alternatives) < required_substring.literal.size())
{
global_alternatives.push_back(required_substring);
}
else
{
global_alternatives.insert(global_alternatives.end(), next_alternatives.begin(), next_alternatives.end());
}
required_substring.clear();
}
return pos;
/* std::cerr
<< "regexp: " << regexp
<< ", is_trivial: " << is_trivial
<< ", required_substring: " << required_substring
<< ", required_substring_is_prefix: " << required_substring_is_prefix
<< std::endl;*/
}
}
void OptimizedRegularExpression::analyze(
std::string_view regexp_,
std::string & required_substring,
bool & is_trivial,
bool & required_substring_is_prefix,
std::vector<std::string> & alternatives)
try
{
Literals alternative_literals;
Literal required_literal;
analyzeImpl(regexp_, regexp_.data(), required_literal, is_trivial, alternative_literals); // NOLINT
required_substring = std::move(required_literal.literal);
required_substring_is_prefix = required_literal.prefix;
for (auto & lit : alternative_literals)
alternatives.push_back(std::move(lit.literal));
}
catch (...)
{
required_substring = "";
is_trivial = false;
required_substring_is_prefix = false;
alternatives.clear();
LOG_ERROR(getLogger("OptimizeRegularExpression"), "Analyze RegularExpression failed, got error: {}", DB::getCurrentExceptionMessage(false));
}
OptimizedRegularExpression::OptimizedRegularExpression(const std::string & regexp_, int options)
{
std::vector<std::string> alternatives_dummy; /// this vector extracts patterns a,b,c from pattern (a|b|c). for now it's not used.
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix, alternatives_dummy);
/// Just three following options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP, "OptimizedRegularExpression: Unsupported option.");
is_case_insensitive = options & RE_CASELESS;
bool is_no_capture = options & RE_NO_CAPTURE;
bool is_dot_nl = options & RE_DOT_NL;
number_of_subpatterns = 0;
if (!is_trivial)
{
/// Compile the re2 regular expression.
re2::RE2::Options regexp_options;
/// Never write error messages to stderr. It's ignorant to do it from library code.
regexp_options.set_log_errors(false);
if (is_case_insensitive)
regexp_options.set_case_sensitive(false);
if (is_dot_nl)
regexp_options.set_dot_nl(true);
re2 = std::make_unique<re2::RE2>(regexp_, regexp_options);
/// Fallback to latin1 to allow matching binary data.
if (!re2->ok() && re2->error_code() == re2::RE2::ErrorCode::ErrorBadUTF8)
{
regexp_options.set_encoding(re2::RE2::Options::EncodingLatin1);
re2 = std::make_unique<re2::RE2>(regexp_, regexp_options);
}
if (!re2->ok())
{
throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP,
"OptimizedRegularExpression: cannot compile re2: {}, error: {}. "
"Look at https://github.com/google/re2/wiki/Syntax "
"for reference. Please note that if you specify regex as an SQL "
"string literal, the slashes have to be additionally escaped. "
"For example, to match an opening brace, write '\\(' -- "
"the first slash is for SQL and the second one is for regex",
regexp_, re2->error());
}
if (!is_no_capture)
{
number_of_subpatterns = re2->NumberOfCapturingGroups();
if (number_of_subpatterns > MAX_SUBPATTERNS)
throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP, "OptimizedRegularExpression: too many subpatterns in regexp: {}", regexp_);
}
}
if (!required_substring.empty())
{
if (is_case_insensitive)
case_insensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
else
case_sensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
}
}
OptimizedRegularExpression::OptimizedRegularExpression(OptimizedRegularExpression && rhs) noexcept
: is_trivial(rhs.is_trivial)
, required_substring_is_prefix(rhs.required_substring_is_prefix)
, is_case_insensitive(rhs.is_case_insensitive)
, required_substring(std::move(rhs.required_substring))
, re2(std::move(rhs.re2))
, number_of_subpatterns(rhs.number_of_subpatterns)
{
if (!required_substring.empty())
{
if (is_case_insensitive)
case_insensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
else
case_sensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
}
}
bool OptimizedRegularExpression::match(const char * subject, size_t subject_size) const
{
const UInt8 * haystack = reinterpret_cast<const UInt8 *>(subject);
const UInt8 * haystack_end = haystack + subject_size;
if (is_trivial)
{
if (required_substring.empty())
return true;
if (is_case_insensitive)
return haystack_end != case_insensitive_substring_searcher->search(haystack, subject_size);
else
return haystack_end != case_sensitive_substring_searcher->search(haystack, subject_size);
}
else
{
if (!required_substring.empty())
{
if (is_case_insensitive)
{
if (haystack_end == case_insensitive_substring_searcher->search(haystack, subject_size))
return false;
}
else
{
if (haystack_end == case_sensitive_substring_searcher->search(haystack, subject_size))
return false;
}
}
return re2->Match({subject, subject_size}, 0, subject_size, re2::RE2::UNANCHORED, nullptr, 0);
}
}
bool OptimizedRegularExpression::match(const char * subject, size_t subject_size, Match & match) const
{
const UInt8 * haystack = reinterpret_cast<const UInt8 *>(subject);
const UInt8 * haystack_end = haystack + subject_size;
if (is_trivial)
{
if (required_substring.empty())
return true;
const UInt8 * pos;
if (is_case_insensitive)
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
else
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
if (haystack_end == pos)
return false;
else
{
match.offset = pos - haystack;
match.length = required_substring.size();
return true;
}
}
else
{
if (!required_substring.empty())
{
const UInt8 * pos;
if (is_case_insensitive)
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
else
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
if (haystack_end == pos)
return false;
}
std::string_view piece;
if (!re2::RE2::PartialMatch({subject, subject_size}, *re2, &piece))
return false;
else
{
match.offset = piece.data() - subject;
match.length = piece.length();
return true;
}
}
}
unsigned OptimizedRegularExpression::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
{
const UInt8 * haystack = reinterpret_cast<const UInt8 *>(subject);
const UInt8 * haystack_end = haystack + subject_size;
matches.clear();
if (limit == 0)
return 0;
limit = std::min(limit, number_of_subpatterns + 1);
if (is_trivial)
{
if (required_substring.empty())
{
matches.emplace_back(Match{0, 0});
return 1;
}
const UInt8 * pos;
if (is_case_insensitive)
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
else
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
if (haystack_end == pos)
return 0;
else
{
Match match;
match.offset = pos - haystack;
match.length = required_substring.size();
matches.push_back(match);
return 1;
}
}
else
{
if (!required_substring.empty())
{
const UInt8 * pos;
if (is_case_insensitive)
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
else
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
if (haystack_end == pos)
return 0;
}
DB::PODArrayWithStackMemory<std::string_view, 128> pieces(limit);
if (!re2->Match(
{subject, subject_size},
0,
subject_size,
re2::RE2::UNANCHORED,
pieces.data(),
static_cast<int>(pieces.size())))
{
return 0;
}
else
{
matches.resize(limit);
for (size_t i = 0; i < limit; ++i)
{
if (pieces[i].empty())
{
matches[i].offset = std::string::npos;
matches[i].length = 0;
}
else
{
matches[i].offset = pieces[i].data() - subject;
matches[i].length = pieces[i].length();
}
}
return limit;
}
}
}