2023-03-22 16:47:25 +00:00
|
|
|
#include <limits>
|
2018-11-30 19:37:31 +00:00
|
|
|
#include <Common/Exception.h>
|
2019-08-23 18:40:42 +00:00
|
|
|
#include <Common/PODArray.h>
|
2023-07-22 03:06:02 +00:00
|
|
|
#include <Common/checkStackSize.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/OptimizedRegularExpression.h>
|
2015-10-05 01:11:12 +00:00
|
|
|
|
|
|
|
#define MIN_LENGTH_FOR_STRSTR 3
|
2020-05-07 01:29:31 +00:00
|
|
|
#define MAX_SUBPATTERNS 1024
|
2015-10-05 01:11:12 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
|
2018-11-30 19:37:31 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int CANNOT_COMPILE_REGEXP;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-13 17:34:47 +00:00
|
|
|
namespace
|
|
|
|
{
|
2018-11-30 19:37:31 +00:00
|
|
|
|
2023-03-13 17:34:47 +00:00
|
|
|
/// A literal string extracted from a regular expression, together with flags
/// describing where it sits in the (sub)expression it was extracted from.
struct Literal
{
    std::string literal;
    /// Both flags are value-initialized so that a freshly constructed Literal
    /// is already in the same state that clear() produces.
    bool prefix = false; /// this literal string is the prefix of the whole string.
    bool suffix = false; /// this literal string is the suffix of the whole string.

    /// Reset to the empty state: no text, neither prefix nor suffix.
    void clear()
    {
        literal.clear();
        prefix = false;
        suffix = false;
    }
};
|
|
|
|
|
|
|
|
using Literals = std::vector<Literal>;
|
|
|
|
|
2023-03-22 16:50:19 +00:00
|
|
|
size_t shortest_literal_length(const Literals & literals)
|
2023-03-13 17:34:47 +00:00
|
|
|
{
|
|
|
|
if (literals.empty()) return 0;
|
2023-03-22 16:47:25 +00:00
|
|
|
size_t shortest = std::numeric_limits<size_t>::max();
|
2023-03-13 17:34:47 +00:00
|
|
|
for (const auto & lit : literals)
|
|
|
|
if (shortest > lit.literal.size())
|
|
|
|
shortest = lit.literal.size();
|
|
|
|
return shortest;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char * analyzeImpl(
|
2022-05-24 19:29:43 +00:00
|
|
|
std::string_view regexp,
|
2023-03-06 18:10:36 +00:00
|
|
|
const char * pos,
|
2023-03-13 17:34:47 +00:00
|
|
|
Literal & required_substring,
|
2017-05-10 02:45:21 +00:00
|
|
|
bool & is_trivial,
|
2023-03-22 16:47:25 +00:00
|
|
|
Literals & global_alternatives)
|
2015-10-05 01:11:12 +00:00
|
|
|
{
|
2023-07-22 03:06:02 +00:00
|
|
|
checkStackSize();
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
/** The expression is trivial if all the metacharacters in it are escaped.
|
|
|
|
* The non-alternative string is
|
|
|
|
* a string outside parentheses,
|
|
|
|
* in which all metacharacters are escaped,
|
|
|
|
* and also if there are no '|' outside the brackets,
|
2017-05-10 04:00:19 +00:00
|
|
|
* and also avoid substrings of the form `http://` or `www` and some other
|
2022-04-15 22:20:47 +00:00
|
|
|
* (this is the hack for typical use case in web analytics applications).
|
2017-05-10 02:45:21 +00:00
|
|
|
*/
|
2023-03-06 18:10:36 +00:00
|
|
|
const char * begin = pos;
|
2017-05-10 02:45:21 +00:00
|
|
|
const char * end = regexp.data() + regexp.size();
|
2023-03-22 16:47:25 +00:00
|
|
|
bool is_first_call = begin == regexp.data();
|
2017-05-10 02:45:21 +00:00
|
|
|
int depth = 0;
|
|
|
|
is_trivial = true;
|
2023-05-16 13:25:04 +00:00
|
|
|
bool is_prefix = true;
|
2017-05-10 02:45:21 +00:00
|
|
|
required_substring.clear();
|
|
|
|
bool has_alternative_on_depth_0 = false;
|
2020-06-14 00:43:42 +00:00
|
|
|
bool has_case_insensitive_flag = false;
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2023-05-22 08:41:22 +00:00
|
|
|
/// Substring with is_prefix.
|
2023-05-16 13:25:04 +00:00
|
|
|
using Substring = std::pair<std::string, bool>;
|
2017-05-10 04:00:19 +00:00
|
|
|
using Substrings = std::vector<Substring>;
|
2017-05-10 02:45:21 +00:00
|
|
|
|
|
|
|
Substrings trivial_substrings(1);
|
|
|
|
Substring * last_substring = &trivial_substrings.back();
|
|
|
|
|
2023-03-22 16:47:25 +00:00
|
|
|
Literals cur_alternatives;
|
2023-03-13 17:34:47 +00:00
|
|
|
|
2023-03-22 16:47:25 +00:00
|
|
|
auto finish_cur_alternatives = [&]()
|
2023-03-13 17:34:47 +00:00
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
if (cur_alternatives.empty())
|
2023-03-13 17:34:47 +00:00
|
|
|
return;
|
|
|
|
|
2023-03-22 16:47:25 +00:00
|
|
|
if (global_alternatives.empty())
|
2023-03-13 17:34:47 +00:00
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
global_alternatives = cur_alternatives;
|
|
|
|
cur_alternatives.clear();
|
2023-03-15 14:38:11 +00:00
|
|
|
return;
|
2023-03-13 17:34:47 +00:00
|
|
|
}
|
2023-03-15 14:38:11 +00:00
|
|
|
/// that means current alternatives have better quality.
|
2023-03-22 16:50:19 +00:00
|
|
|
if (shortest_literal_length(global_alternatives) < shortest_literal_length(cur_alternatives))
|
2023-03-13 17:34:47 +00:00
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
global_alternatives.clear();
|
|
|
|
global_alternatives = cur_alternatives;
|
2023-03-13 17:34:47 +00:00
|
|
|
}
|
2023-03-22 16:47:25 +00:00
|
|
|
cur_alternatives.clear();
|
2023-03-13 17:34:47 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
auto finish_non_trivial_char = [&](bool create_new_substr = true)
|
2023-03-03 16:57:10 +00:00
|
|
|
{
|
2023-05-16 13:25:04 +00:00
|
|
|
is_trivial = false;
|
|
|
|
if (create_new_substr)
|
|
|
|
is_prefix = false;
|
2023-03-03 16:57:10 +00:00
|
|
|
if (depth != 0)
|
|
|
|
return;
|
2023-03-06 18:10:36 +00:00
|
|
|
|
2023-03-22 16:47:25 +00:00
|
|
|
for (auto & alter : cur_alternatives)
|
2023-03-13 17:34:47 +00:00
|
|
|
{
|
|
|
|
if (alter.suffix)
|
|
|
|
{
|
|
|
|
alter.literal += last_substring->first;
|
2023-05-16 13:25:04 +00:00
|
|
|
alter.suffix = false;
|
2023-03-13 17:34:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-22 16:47:25 +00:00
|
|
|
finish_cur_alternatives();
|
2023-03-13 17:34:47 +00:00
|
|
|
|
|
|
|
if (!last_substring->first.empty() && create_new_substr)
|
2023-03-06 18:10:36 +00:00
|
|
|
{
|
|
|
|
trivial_substrings.resize(trivial_substrings.size() + 1);
|
|
|
|
last_substring = &trivial_substrings.back();
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2023-03-13 17:34:47 +00:00
|
|
|
/// Resolve the string or alters in a group (xxxxx)
|
2023-03-22 16:47:25 +00:00
|
|
|
auto finish_group = [&](Literal & group_required_string, Literals & group_alternatives)
|
2023-03-06 18:10:36 +00:00
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
for (auto & alter : group_alternatives)
|
2023-03-06 18:10:36 +00:00
|
|
|
{
|
2023-03-13 17:34:47 +00:00
|
|
|
if (alter.prefix)
|
|
|
|
{
|
|
|
|
alter.literal = last_substring->first + alter.literal;
|
2023-05-16 13:25:04 +00:00
|
|
|
alter.prefix = is_prefix;
|
2023-03-13 17:34:47 +00:00
|
|
|
}
|
2023-03-06 18:10:36 +00:00
|
|
|
}
|
|
|
|
|
2023-03-13 17:34:47 +00:00
|
|
|
if (group_required_string.prefix)
|
2023-05-16 13:25:04 +00:00
|
|
|
{
|
2023-03-13 17:34:47 +00:00
|
|
|
last_substring->first += group_required_string.literal;
|
2023-05-16 13:25:04 +00:00
|
|
|
last_substring->second = is_prefix;
|
|
|
|
}
|
2023-03-06 18:10:36 +00:00
|
|
|
else
|
|
|
|
{
|
|
|
|
finish_non_trivial_char();
|
2023-03-15 14:38:11 +00:00
|
|
|
last_substring->first = group_required_string.literal;
|
2023-05-16 13:25:04 +00:00
|
|
|
last_substring->second = false;
|
2023-03-06 18:10:36 +00:00
|
|
|
}
|
2023-05-16 13:25:04 +00:00
|
|
|
|
|
|
|
is_prefix = is_prefix && group_required_string.prefix && group_required_string.suffix;
|
|
|
|
|
2023-03-03 16:57:10 +00:00
|
|
|
/// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg
|
2023-03-13 17:34:47 +00:00
|
|
|
if (!last_substring->first.empty() && !group_required_string.suffix)
|
2023-03-03 16:57:10 +00:00
|
|
|
{
|
|
|
|
trivial_substrings.resize(trivial_substrings.size() + 1);
|
|
|
|
last_substring = &trivial_substrings.back();
|
|
|
|
}
|
2023-03-13 17:34:47 +00:00
|
|
|
|
|
|
|
/// assign group alters to current alters.
|
2023-03-22 16:47:25 +00:00
|
|
|
finish_cur_alternatives();
|
|
|
|
cur_alternatives = std::move(group_alternatives);
|
2023-03-03 16:57:10 +00:00
|
|
|
};
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
bool in_curly_braces = false;
|
|
|
|
bool in_square_braces = false;
|
|
|
|
|
|
|
|
while (pos != end)
|
|
|
|
{
|
|
|
|
switch (*pos)
|
|
|
|
{
|
|
|
|
case '\0':
|
|
|
|
pos = end;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '\\':
|
|
|
|
{
|
|
|
|
++pos;
|
|
|
|
if (pos == end)
|
|
|
|
break;
|
|
|
|
|
|
|
|
switch (*pos)
|
|
|
|
{
|
2020-06-14 00:43:42 +00:00
|
|
|
case '|':
|
|
|
|
case '(':
|
|
|
|
case ')':
|
|
|
|
case '^':
|
|
|
|
case '$':
|
|
|
|
case '.':
|
|
|
|
case '[':
|
2023-03-06 18:10:36 +00:00
|
|
|
case ']':
|
2020-06-14 00:43:42 +00:00
|
|
|
case '?':
|
|
|
|
case '*':
|
|
|
|
case '+':
|
2023-03-06 18:10:36 +00:00
|
|
|
case '-':
|
2020-06-14 00:43:42 +00:00
|
|
|
case '{':
|
2023-03-06 18:10:36 +00:00
|
|
|
case '}':
|
|
|
|
case '/':
|
|
|
|
goto ordinary;
|
2017-05-10 02:45:21 +00:00
|
|
|
default:
|
|
|
|
/// all other escape sequences are not supported
|
2023-03-06 18:10:36 +00:00
|
|
|
finish_non_trivial_char();
|
2017-05-10 02:45:21 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
case '|':
|
|
|
|
is_trivial = false;
|
2023-05-16 13:25:04 +00:00
|
|
|
is_prefix = false;
|
2023-03-06 18:10:36 +00:00
|
|
|
++pos;
|
|
|
|
if (depth == 0)
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
2023-03-06 18:10:36 +00:00
|
|
|
has_alternative_on_depth_0 = true;
|
|
|
|
goto finish;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '(':
|
2023-05-16 13:25:04 +00:00
|
|
|
/// bracket does not break is_prefix. for example abc(d) has a prefix 'abcd'
|
2023-03-10 14:30:29 +00:00
|
|
|
is_trivial = false;
|
2017-05-10 02:45:21 +00:00
|
|
|
if (!in_square_braces)
|
|
|
|
{
|
2020-06-14 00:43:42 +00:00
|
|
|
/// Check for case-insensitive flag.
|
|
|
|
if (pos + 1 < end && pos[1] == '?')
|
|
|
|
{
|
|
|
|
for (size_t offset = 2; pos + offset < end; ++offset)
|
|
|
|
{
|
|
|
|
if (pos[offset] == '-' /// it means flag negation
|
|
|
|
/// various possible flags, actually only imsU are supported by re2
|
|
|
|
|| (pos[offset] >= 'a' && pos[offset] <= 'z')
|
|
|
|
|| (pos[offset] >= 'A' && pos[offset] <= 'Z'))
|
|
|
|
{
|
|
|
|
if (pos[offset] == 'i')
|
|
|
|
{
|
|
|
|
/// Actually it can be negated case-insensitive flag. But we don't care.
|
|
|
|
has_case_insensitive_flag = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2023-03-03 16:57:10 +00:00
|
|
|
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
|
|
|
|
{
|
|
|
|
pos += 2;
|
|
|
|
}
|
2023-03-13 17:34:47 +00:00
|
|
|
Literal group_required_substr;
|
|
|
|
bool group_is_trival = true;
|
|
|
|
Literals group_alters;
|
|
|
|
pos = analyzeImpl(regexp, pos + 1, group_required_substr, group_is_trival, group_alters);
|
2023-03-06 18:10:36 +00:00
|
|
|
/// pos should be ')', if not, then it is not a valid regular expression
|
|
|
|
if (pos == end)
|
|
|
|
return pos;
|
|
|
|
|
2023-03-13 17:34:47 +00:00
|
|
|
/// For ()? or ()* or (){0,1}, we can just ignore the whole group.
|
2023-03-06 18:10:36 +00:00
|
|
|
if ((pos + 1 < end && (pos[1] == '?' || pos[1] == '*')) ||
|
|
|
|
(pos + 2 < end && pos[1] == '{' && pos[2] == '0'))
|
|
|
|
{
|
|
|
|
finish_non_trivial_char();
|
|
|
|
}
|
|
|
|
else
|
2023-03-13 17:34:47 +00:00
|
|
|
{
|
|
|
|
finish_group(group_required_substr, group_alters);
|
|
|
|
}
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '[':
|
|
|
|
in_square_braces = true;
|
|
|
|
++depth;
|
2023-03-06 18:10:36 +00:00
|
|
|
finish_non_trivial_char();
|
2017-05-10 02:45:21 +00:00
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ']':
|
|
|
|
if (!in_square_braces)
|
|
|
|
goto ordinary;
|
|
|
|
|
|
|
|
--depth;
|
2023-03-06 18:10:36 +00:00
|
|
|
if (depth == 0)
|
|
|
|
in_square_braces = false;
|
|
|
|
finish_non_trivial_char();
|
2017-05-10 02:45:21 +00:00
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case ')':
|
|
|
|
if (!in_square_braces)
|
|
|
|
{
|
2023-03-06 18:10:36 +00:00
|
|
|
goto finish;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '^': case '$': case '.': case '+':
|
2023-03-06 18:10:36 +00:00
|
|
|
finish_non_trivial_char();
|
2017-05-10 02:45:21 +00:00
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
2019-01-22 19:56:53 +00:00
|
|
|
/// Quantifiers that allow a zero number of occurrences.
|
2017-05-10 02:45:21 +00:00
|
|
|
case '{':
|
|
|
|
in_curly_braces = true;
|
2017-12-02 03:22:51 +00:00
|
|
|
[[fallthrough]];
|
|
|
|
case '?':
|
|
|
|
[[fallthrough]];
|
|
|
|
case '*':
|
2023-03-03 16:57:10 +00:00
|
|
|
if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
|
|
|
last_substring->first.resize(last_substring->first.size() - 1);
|
|
|
|
}
|
2023-03-06 18:10:36 +00:00
|
|
|
finish_non_trivial_char();
|
2017-05-10 02:45:21 +00:00
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case '}':
|
|
|
|
if (!in_curly_braces)
|
|
|
|
goto ordinary;
|
|
|
|
|
|
|
|
in_curly_braces = false;
|
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
|
|
|
|
ordinary: /// Normal, not escaped symbol.
|
2017-12-02 03:22:51 +00:00
|
|
|
[[fallthrough]];
|
2017-05-10 02:45:21 +00:00
|
|
|
default:
|
|
|
|
if (depth == 0 && !in_curly_braces && !in_square_braces)
|
|
|
|
{
|
2023-05-16 13:25:04 +00:00
|
|
|
/// record the first position of last string.
|
2017-05-10 02:45:21 +00:00
|
|
|
if (last_substring->first.empty())
|
2023-05-16 13:25:04 +00:00
|
|
|
last_substring->second = is_prefix;
|
2017-05-10 02:45:21 +00:00
|
|
|
last_substring->first.push_back(*pos);
|
|
|
|
}
|
|
|
|
++pos;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2023-03-06 18:10:36 +00:00
|
|
|
finish:
|
2023-03-13 17:34:47 +00:00
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
if (!is_trivial)
|
|
|
|
{
|
2023-05-16 13:25:04 +00:00
|
|
|
finish_non_trivial_char(false);
|
2023-03-06 18:10:36 +00:00
|
|
|
/// we calculate required substring even though has_alternative_on_depth_0.
|
|
|
|
/// we will clear the required substring after putting it to alternatives.
|
|
|
|
if (!has_case_insensitive_flag)
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
2019-05-05 12:26:20 +00:00
|
|
|
/// We choose the non-alternative substring of the maximum length for first search.
|
2019-05-05 09:32:26 +00:00
|
|
|
|
|
|
|
/// Tuning for typical usage domain
|
|
|
|
auto tuning_strings_condition = [](const std::string & str)
|
|
|
|
{
|
|
|
|
return str != "://" && str != "http://" && str != "www" && str != "Windows ";
|
|
|
|
};
|
2017-05-10 02:45:21 +00:00
|
|
|
size_t max_length = 0;
|
|
|
|
Substrings::const_iterator candidate_it = trivial_substrings.begin();
|
|
|
|
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
|
|
|
|
{
|
2019-05-05 12:26:20 +00:00
|
|
|
if (it->first.size() > max_length && tuning_strings_condition(it->first))
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
|
|
|
max_length = it->first.size();
|
|
|
|
candidate_it = it;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-22 16:47:25 +00:00
|
|
|
if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
2023-03-13 17:34:47 +00:00
|
|
|
required_substring.literal = candidate_it->first;
|
2023-05-16 13:25:04 +00:00
|
|
|
required_substring.prefix = candidate_it->second;
|
2023-03-13 17:34:47 +00:00
|
|
|
required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2017-10-25 18:39:10 +00:00
|
|
|
else if (!trivial_substrings.empty())
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
2023-03-13 17:34:47 +00:00
|
|
|
required_substring.literal = trivial_substrings.front().first;
|
2023-05-16 13:25:04 +00:00
|
|
|
/// trivial string means the whole regex is a simple string literal, so the prefix and suffix should be true.
|
|
|
|
required_substring.prefix = true;
|
2023-03-13 17:34:47 +00:00
|
|
|
required_substring.suffix = true;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
|
2023-03-06 18:10:36 +00:00
|
|
|
/// if it is xxx|xxx|xxx, we should call the next xxx|xxx recursively and collect the result.
|
|
|
|
if (has_alternative_on_depth_0)
|
|
|
|
{
|
2023-03-13 17:34:47 +00:00
|
|
|
/// compare the quality of required substring and alternatives and choose the better one.
|
2023-03-22 16:50:19 +00:00
|
|
|
if (shortest_literal_length(global_alternatives) < required_substring.literal.size())
|
2023-03-22 16:47:25 +00:00
|
|
|
global_alternatives = {required_substring};
|
2023-03-13 17:34:47 +00:00
|
|
|
Literals next_alternatives;
|
2023-03-06 18:10:36 +00:00
|
|
|
/// this two vals are useless, xxx|xxx cannot be trivial nor prefix.
|
2023-03-13 17:34:47 +00:00
|
|
|
bool next_is_trivial = true;
|
|
|
|
pos = analyzeImpl(regexp, pos, required_substring, next_is_trivial, next_alternatives);
|
2023-03-06 18:10:36 +00:00
|
|
|
/// For xxx|xxx|xxx, we only conbine the alternatives and return a empty required_substring.
|
2023-03-22 16:50:19 +00:00
|
|
|
if (next_alternatives.empty() || shortest_literal_length(next_alternatives) < required_substring.literal.size())
|
2023-03-06 18:10:36 +00:00
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
global_alternatives.push_back(required_substring);
|
2023-03-06 18:10:36 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
global_alternatives.insert(global_alternatives.end(), next_alternatives.begin(), next_alternatives.end());
|
2023-03-06 18:10:36 +00:00
|
|
|
}
|
|
|
|
required_substring.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
return pos;
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
/* std::cerr
|
|
|
|
<< "regexp: " << regexp
|
|
|
|
<< ", is_trivial: " << is_trivial
|
|
|
|
<< ", required_substring: " << required_substring
|
|
|
|
<< ", required_substring_is_prefix: " << required_substring_is_prefix
|
|
|
|
<< std::endl;*/
|
2015-10-05 01:11:12 +00:00
|
|
|
}
|
2023-03-13 17:34:47 +00:00
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
|
2023-03-13 17:34:47 +00:00
|
|
|
template <bool thread_safe>
|
|
|
|
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
|
|
|
std::string_view regexp_,
|
|
|
|
std::string & required_substring,
|
|
|
|
bool & is_trivial,
|
|
|
|
bool & required_substring_is_prefix,
|
|
|
|
std::vector<std::string> & alternatives)
|
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
Literals alternative_literals;
|
|
|
|
Literal required_literal;
|
|
|
|
analyzeImpl(regexp_, regexp_.data(), required_literal, is_trivial, alternative_literals);
|
|
|
|
required_substring = std::move(required_literal.literal);
|
|
|
|
required_substring_is_prefix = required_literal.prefix;
|
|
|
|
for (auto & lit : alternative_literals)
|
2023-03-13 17:34:47 +00:00
|
|
|
alternatives.push_back(std::move(lit.literal));
|
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
template <bool thread_safe>
|
|
|
|
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
|
2015-10-05 01:11:12 +00:00
|
|
|
{
|
2023-03-22 16:47:25 +00:00
|
|
|
std::vector<std::string> alternativesDummy; /// this vector extracts patterns a,b,c from pattern (a|b|c). for now it's not used.
|
|
|
|
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix, alternativesDummy);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2023-03-14 16:44:02 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/// Just three following options are supported
|
2017-05-10 02:45:21 +00:00
|
|
|
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
|
2023-01-23 21:13:58 +00:00
|
|
|
throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP, "OptimizedRegularExpression: Unsupported option.");
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2018-11-30 19:37:31 +00:00
|
|
|
is_case_insensitive = options & RE_CASELESS;
|
|
|
|
bool is_no_capture = options & RE_NO_CAPTURE;
|
|
|
|
bool is_dot_nl = options & RE_DOT_NL;
|
2017-05-10 02:45:21 +00:00
|
|
|
|
|
|
|
number_of_subpatterns = 0;
|
|
|
|
if (!is_trivial)
|
|
|
|
{
|
|
|
|
/// Compile the re2 regular expression.
|
2018-06-03 16:51:31 +00:00
|
|
|
typename RegexType::Options regexp_options;
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2018-11-30 19:37:31 +00:00
|
|
|
/// Never write error messages to stderr. It's ignorant to do it from library code.
|
|
|
|
regexp_options.set_log_errors(false);
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_case_insensitive)
|
2018-06-03 16:51:31 +00:00
|
|
|
regexp_options.set_case_sensitive(false);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
|
|
|
if (is_dot_nl)
|
2018-06-03 16:51:31 +00:00
|
|
|
regexp_options.set_dot_nl(true);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2018-06-03 16:51:31 +00:00
|
|
|
re2 = std::make_unique<RegexType>(regexp_, regexp_options);
|
2017-05-10 02:45:21 +00:00
|
|
|
if (!re2->ok())
|
2020-03-25 11:34:33 +00:00
|
|
|
{
|
2023-01-23 21:13:58 +00:00
|
|
|
throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP,
|
|
|
|
"OptimizedRegularExpression: cannot compile re2: {}, error: {}. "
|
|
|
|
"Look at https://github.com/google/re2/wiki/Syntax "
|
2020-03-25 11:34:33 +00:00
|
|
|
"for reference. Please note that if you specify regex as an SQL "
|
|
|
|
"string literal, the slashes have to be additionally escaped. "
|
|
|
|
"For example, to match an opening brace, write '\\(' -- "
|
|
|
|
"the first slash is for SQL and the second one is for regex",
|
2023-01-23 21:13:58 +00:00
|
|
|
regexp_, re2->error());
|
2020-03-25 11:34:33 +00:00
|
|
|
}
|
2020-03-25 13:51:50 +00:00
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
if (!is_no_capture)
|
|
|
|
{
|
|
|
|
number_of_subpatterns = re2->NumberOfCapturingGroups();
|
|
|
|
if (number_of_subpatterns > MAX_SUBPATTERNS)
|
2023-01-23 21:13:58 +00:00
|
|
|
throw DB::Exception(DB::ErrorCodes::CANNOT_COMPILE_REGEXP, "OptimizedRegularExpression: too many subpatterns in regexp: {}", regexp_);
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
}
|
2020-02-17 18:53:59 +00:00
|
|
|
|
|
|
|
if (!required_substring.empty())
|
|
|
|
{
|
|
|
|
if (is_case_insensitive)
|
|
|
|
case_insensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
|
|
|
|
else
|
|
|
|
case_sensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
|
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
}
|
|
|
|
|
Fix UB (stack-use-after-scope) in extractAll()
After #37544 OptimizedRegularExpressionImpl started to be moved, but
StringSearcher is not copyable since it holds pointers that go out of
scope after the move (previously Regexps::get() returned
std::shared_ptr<Regexp>, but it has been replaced with
Regexps::createRegexp(), which returns a Regexp object).
<details>
<summary>ASan report</summary>
==48348==ERROR: AddressSanitizer: stack-use-after-scope on address 0x7fff577239a9 at pc 0x00001518209b bp 0x7fff57723820 sp 0x7fff57723818
READ of size 1 at 0x7fff577239a9 thread T0
0 0x1518209a in char8_t const* DB::StringSearcher<true, true>::search<char8_t>(char8_t const*, char8_t const*) const /bld/./src/Common/StringSearcher.h:730:41
1 0x1518dd3f in char8_t const* DB::StringSearcher<true, true>::search<char8_t>(char8_t const*, unsigned long) const /bld/./src/Common/StringSearcher.h:751:16
2 0x1518dd3f in OptimizedRegularExpressionImpl<false>::match(char const*, unsigned long, std::__1::vector<OptimizedRegularExpressionDetails::Match, std::__1::allocator<OptimizedRegularExpressionDetails::Match> >&, unsigned int) const /bld/./src/Common/OptimizedRegularExpression.cpp:463:54
3 0x1811cb42 in DB::ExtractAllImpl::get(char const*&, char const*&) /bld/./src/Functions/FunctionsStringArray.h:588:18
4 0x1811aa62 in DB::FunctionTokens<DB::ExtractAllImpl>::executeImpl(std::__1::vector<DB::ColumnWithTypeAndName, std::__1::allocator<DB::ColumnWithTypeAndName> > const&, std::__1::shared_ptr<DB::IDataType const> const&, unsigned long) const /bld/./src/Functions/FunctionsStringArray.h:704:30
5 0x14fe17b4 in DB::IFunction::executeImplDryRun(std::__1::vector<DB::ColumnWithTypeAndName, std::__1::allocator<DB::ColumnWithTypeAndName> > const&, std::__1::shared_ptr<DB::IDataType const> const&, unsigned long) const /bld/./src/Functions/IFunction.h:409:16
Address 0x7fff577239a9 is located in stack of thread T0 at offset 201 in frame
0 0x1518d98f in OptimizedRegularExpressionImpl<false>::match(char const*, unsigned long, std::__1::vector<OptimizedRegularExpressionDetails::Match, std::__1::allocator<OptimizedRegularExpressionDetails::Match> >&, unsigned int) const /bld/./src/Common/OptimizedRegularExpression.cpp:439
</details>
CI: https://s3.amazonaws.com/clickhouse-test-reports/39342/c6f7698f9ad6ae22199182ebf7c3b2dac77d69d8/fuzzer_astfuzzerasan,actions//report.html
Fixes: #37544 (cc @rschu1ze)
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2022-07-19 11:31:56 +00:00
|
|
|
template <bool thread_safe>
|
|
|
|
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(OptimizedRegularExpressionImpl && rhs) noexcept
|
|
|
|
: is_trivial(rhs.is_trivial)
|
|
|
|
, required_substring_is_prefix(rhs.required_substring_is_prefix)
|
|
|
|
, is_case_insensitive(rhs.is_case_insensitive)
|
|
|
|
, required_substring(std::move(rhs.required_substring))
|
|
|
|
, re2(std::move(rhs.re2))
|
|
|
|
, number_of_subpatterns(rhs.number_of_subpatterns)
|
|
|
|
{
|
|
|
|
if (!required_substring.empty())
|
|
|
|
{
|
|
|
|
if (is_case_insensitive)
|
|
|
|
case_insensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
|
|
|
|
else
|
|
|
|
case_sensitive_substring_searcher.emplace(required_substring.data(), required_substring.size());
|
|
|
|
}
|
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
template <bool thread_safe>
|
|
|
|
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size) const
|
2015-10-05 01:11:12 +00:00
|
|
|
{
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * haystack = reinterpret_cast<const UInt8 *>(subject);
|
|
|
|
const UInt8 * haystack_end = haystack + subject_size;
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_trivial)
|
|
|
|
{
|
2020-02-21 15:55:36 +00:00
|
|
|
if (required_substring.empty())
|
|
|
|
return true;
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_case_insensitive)
|
2020-02-20 19:38:18 +00:00
|
|
|
return haystack_end != case_insensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
2020-02-20 19:38:18 +00:00
|
|
|
return haystack_end != case_sensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (!required_substring.empty())
|
|
|
|
{
|
|
|
|
if (is_case_insensitive)
|
2020-02-20 19:38:18 +00:00
|
|
|
{
|
|
|
|
if (haystack_end == case_insensitive_substring_searcher->search(haystack, subject_size))
|
|
|
|
return false;
|
|
|
|
}
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
2020-02-20 19:38:18 +00:00
|
|
|
{
|
|
|
|
if (haystack_end == case_sensitive_substring_searcher->search(haystack, subject_size))
|
|
|
|
return false;
|
|
|
|
}
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
|
2023-06-30 08:55:49 +00:00
|
|
|
return re2->Match({subject, subject_size}, 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
template <bool thread_safe>
|
|
|
|
bool OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, Match & match) const
|
2015-10-05 01:11:12 +00:00
|
|
|
{
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * haystack = reinterpret_cast<const UInt8 *>(subject);
|
|
|
|
const UInt8 * haystack_end = haystack + subject_size;
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_trivial)
|
|
|
|
{
|
2020-02-21 15:55:36 +00:00
|
|
|
if (required_substring.empty())
|
|
|
|
return true;
|
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * pos;
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_case_insensitive)
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
if (haystack_end == pos)
|
|
|
|
return false;
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
|
|
|
{
|
2020-02-20 19:38:18 +00:00
|
|
|
match.offset = pos - haystack;
|
2017-05-10 02:45:21 +00:00
|
|
|
match.length = required_substring.size();
|
2020-03-08 21:04:10 +00:00
|
|
|
return true;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (!required_substring.empty())
|
|
|
|
{
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * pos;
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_case_insensitive)
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
if (haystack_end == pos)
|
|
|
|
return false;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
|
2023-06-30 08:55:49 +00:00
|
|
|
std::string_view piece;
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2023-06-30 08:55:49 +00:00
|
|
|
if (!RegexType::PartialMatch({subject, subject_size}, *re2, &piece))
|
2020-02-20 19:38:18 +00:00
|
|
|
return false;
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
|
|
|
{
|
|
|
|
match.offset = piece.data() - subject;
|
|
|
|
match.length = piece.length();
|
2020-02-20 19:38:18 +00:00
|
|
|
return true;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
template <bool thread_safe>
|
|
|
|
unsigned OptimizedRegularExpressionImpl<thread_safe>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
|
2015-10-05 01:11:12 +00:00
|
|
|
{
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * haystack = reinterpret_cast<const UInt8 *>(subject);
|
|
|
|
const UInt8 * haystack_end = haystack + subject_size;
|
|
|
|
|
2017-05-10 02:45:21 +00:00
|
|
|
matches.clear();
|
|
|
|
|
|
|
|
if (limit == 0)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (limit > number_of_subpatterns + 1)
|
|
|
|
limit = number_of_subpatterns + 1;
|
|
|
|
|
|
|
|
if (is_trivial)
|
|
|
|
{
|
2020-02-21 15:55:36 +00:00
|
|
|
if (required_substring.empty())
|
2020-02-24 18:49:38 +00:00
|
|
|
{
|
2020-02-24 18:52:03 +00:00
|
|
|
matches.emplace_back(Match{0, 0});
|
2020-02-21 15:55:36 +00:00
|
|
|
return 1;
|
2020-02-24 18:49:38 +00:00
|
|
|
}
|
2020-02-21 15:55:36 +00:00
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * pos;
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_case_insensitive)
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
if (haystack_end == pos)
|
2017-05-10 02:45:21 +00:00
|
|
|
return 0;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Match match;
|
2020-02-20 19:38:18 +00:00
|
|
|
match.offset = pos - haystack;
|
2017-05-10 02:45:21 +00:00
|
|
|
match.length = required_substring.size();
|
|
|
|
matches.push_back(match);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (!required_substring.empty())
|
|
|
|
{
|
2020-02-20 19:38:18 +00:00
|
|
|
const UInt8 * pos;
|
2017-05-10 02:45:21 +00:00
|
|
|
if (is_case_insensitive)
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_insensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
2020-02-20 19:38:18 +00:00
|
|
|
pos = case_sensitive_substring_searcher->search(haystack, subject_size);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2020-02-20 19:38:18 +00:00
|
|
|
if (haystack_end == pos)
|
2017-05-10 02:45:21 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2023-06-30 08:55:49 +00:00
|
|
|
DB::PODArrayWithStackMemory<std::string_view, 128> pieces(limit);
|
2017-05-10 02:45:21 +00:00
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
if (!re2->Match(
|
2023-06-30 08:55:49 +00:00
|
|
|
{subject, subject_size},
|
2022-10-07 10:46:45 +00:00
|
|
|
0,
|
|
|
|
subject_size,
|
|
|
|
RegexType::UNANCHORED,
|
|
|
|
pieces.data(),
|
|
|
|
static_cast<int>(pieces.size())))
|
|
|
|
{
|
2017-05-10 02:45:21 +00:00
|
|
|
return 0;
|
2022-10-07 10:46:45 +00:00
|
|
|
}
|
2017-05-10 02:45:21 +00:00
|
|
|
else
|
|
|
|
{
|
|
|
|
matches.resize(limit);
|
|
|
|
for (size_t i = 0; i < limit; ++i)
|
|
|
|
{
|
2023-06-13 16:29:41 +00:00
|
|
|
if (pieces[i].empty())
|
2017-05-10 02:45:21 +00:00
|
|
|
{
|
2023-06-13 16:29:41 +00:00
|
|
|
matches[i].offset = std::string::npos;
|
|
|
|
matches[i].length = 0;
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2023-06-13 16:29:41 +00:00
|
|
|
matches[i].offset = pieces[i].data() - subject;
|
|
|
|
matches[i].length = pieces[i].length();
|
2017-05-10 02:45:21 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return limit;
|
|
|
|
}
|
|
|
|
}
|
2015-10-05 01:11:12 +00:00
|
|
|
}
|
|
|
|
|
2018-11-30 19:37:31 +00:00
|
|
|
template class OptimizedRegularExpressionImpl<true>;
|
|
|
|
template class OptimizedRegularExpressionImpl<false>;
|