2022-06-24 13:24:17 +00:00
|
|
|
#include <Functions/checkHyperscanRegexp.h>
|
2021-08-07 05:07:41 +00:00
|
|
|
|
|
|
|
#include <Common/Exception.h>
|
2023-02-08 13:07:27 +00:00
|
|
|
#include <charconv>
|
2021-08-07 05:07:41 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int BAD_ARGUMENTS;
|
|
|
|
}
|
|
|
|
|
2022-06-24 13:24:17 +00:00
|
|
|
void checkHyperscanRegexp(const std::vector<std::string_view> & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length)
|
2021-08-07 05:07:41 +00:00
|
|
|
{
|
|
|
|
if (max_hyperscan_regexp_length > 0 || max_hyperscan_regexp_total_length > 0)
|
|
|
|
{
|
|
|
|
size_t total_regexp_length = 0;
|
2022-06-24 13:16:57 +00:00
|
|
|
for (const auto & regexp : regexps)
|
2021-08-07 05:07:41 +00:00
|
|
|
{
|
2022-06-24 13:16:57 +00:00
|
|
|
if (max_hyperscan_regexp_length > 0 && regexp.size() > max_hyperscan_regexp_length)
|
2023-01-26 19:20:56 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Regexp length too large ({} > {})", regexp.size(), max_hyperscan_regexp_length);
|
2022-06-24 13:16:57 +00:00
|
|
|
total_regexp_length += regexp.size();
|
2021-08-07 05:07:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (max_hyperscan_regexp_total_length > 0 && total_regexp_length > max_hyperscan_regexp_total_length)
|
2023-01-26 19:20:56 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Total regexp lengths too large ({} > {})",
|
|
|
|
total_regexp_length, max_hyperscan_regexp_total_length);
|
2021-08-07 05:07:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-02-08 13:07:27 +00:00
|
|
|
namespace
{

/// Parses a decimal integer from the beginning of `str` and tells whether it is greater than 50.
/// Returns false if `str` does not start with a number (including the empty string).
/// A number that is too big to be represented as int is, by definition, larger than fifty,
/// so an out-of-range parse result yields true instead of being treated as a parse failure.
bool isLargerThanFifty(std::string_view str)
{
    int number = 0;

    /// std::from_chars requires `const char *`; string_view iterators are not guaranteed to be raw pointers.
    const char * const first = str.data();
    const char * const last = first + str.size();
    const auto result = std::from_chars(first, last, number);

    if (result.ec == std::errc::result_out_of_range)
        return true;
    if (result.ec != std::errc())
        return false;
    return number > 50;
}

}
|
|
|
|
|
|
|
|
/// Check for sub-patterns of the form x{n} or x{n,} can be expensive. Ignore spaces before/after n and m.
|
|
|
|
bool SlowWithHyperscanChecker::isSlowOneRepeat(std::string_view regexp)
|
|
|
|
{
|
2023-06-30 08:55:49 +00:00
|
|
|
std::string_view haystack(regexp.data(), regexp.size());
|
|
|
|
std::string_view matches[2];
|
2023-02-08 13:07:27 +00:00
|
|
|
size_t start_pos = 0;
|
2023-02-24 11:29:45 +00:00
|
|
|
while (start_pos < haystack.size())
|
2023-02-08 13:07:27 +00:00
|
|
|
{
|
2023-09-14 16:12:29 +00:00
|
|
|
if (searcher_one_repeat.Match(haystack, start_pos, haystack.size(), re2::RE2::Anchor::UNANCHORED, matches, 2))
|
2023-02-08 13:07:27 +00:00
|
|
|
{
|
|
|
|
const auto & match = matches[0];
|
2023-02-24 11:29:45 +00:00
|
|
|
start_pos = (matches[0].data() - haystack.data()) + match.size(); // new start pos = prefix before match + match length
|
2023-02-08 13:07:27 +00:00
|
|
|
const auto & submatch = matches[1];
|
|
|
|
if (isLargerThanFifty({submatch.data(), submatch.size()}))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Check if sub-patterns of the form x{n,m} can be expensive. Ignore spaces before/after n and m.
|
|
|
|
bool SlowWithHyperscanChecker::isSlowTwoRepeats(std::string_view regexp)
|
|
|
|
{
|
2023-06-30 08:55:49 +00:00
|
|
|
std::string_view haystack(regexp.data(), regexp.size());
|
|
|
|
std::string_view matches[3];
|
2023-02-08 13:07:27 +00:00
|
|
|
size_t start_pos = 0;
|
2023-02-24 11:29:45 +00:00
|
|
|
while (start_pos < haystack.size())
|
2023-02-08 13:07:27 +00:00
|
|
|
{
|
2023-09-14 16:12:29 +00:00
|
|
|
if (searcher_two_repeats.Match(haystack, start_pos, haystack.size(), re2::RE2::Anchor::UNANCHORED, matches, 3))
|
2023-02-08 13:07:27 +00:00
|
|
|
{
|
|
|
|
const auto & match = matches[0];
|
2023-02-24 11:29:45 +00:00
|
|
|
start_pos = (matches[0].data() - haystack.data()) + match.size(); // new start pos = prefix before match + match length
|
2023-02-08 13:07:27 +00:00
|
|
|
const auto & submatch1 = matches[1];
|
|
|
|
const auto & submatch2 = matches[2];
|
|
|
|
if (isLargerThanFifty({submatch1.data(), submatch1.size()})
|
|
|
|
|| isLargerThanFifty({submatch2.data(), submatch2.size()}))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Compiles the two patterns used to find bounded repetition quantifiers inside a user-supplied regexp.
SlowWithHyperscanChecker::SlowWithHyperscanChecker()
    /// Matches `{n}` and `{n,}` (optional whitespace around n and the comma); captures n.
    : searcher_one_repeat(R"(\{\s*([\d]+)\s*,?\s*})")
    /// Matches `{n,m}` (optional whitespace around n, the comma and m); captures n and m.
    , searcher_two_repeats(R"(\{\s*([\d]+)\s*,\s*([\d]+)\s*\})")
{
}
|
|
|
|
|
|
|
|
bool SlowWithHyperscanChecker::isSlow(std::string_view regexp)
|
|
|
|
{
|
|
|
|
if (isSlowOneRepeat(regexp))
|
|
|
|
return true;
|
|
|
|
else if (isSlowTwoRepeats(regexp))
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-08-07 05:07:41 +00:00
|
|
|
}
|