mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 17:41:59 +00:00
Reject DoS-prone hyperscan regexes
This commit is contained in:
parent
99274f1db1
commit
74937cf27b
@ -391,10 +391,18 @@ For patterns to search for substrings in a string, it is better to use LIKE or
|
|||||||
|
|
||||||
## multiMatchAny(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\])
|
## multiMatchAny(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\])
|
||||||
|
|
||||||
The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster.
|
The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster.
|
||||||
|
|
||||||
:::note
|
:::note
|
||||||
The length of any of the `haystack` string must be less than 2<sup>32</sup> bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API.
|
Functions `multiMatchAny`, `multiMatchAnyIndex`, `multiMatchAllIndices` and their fuzzy equivalents (`multiFuzzyMatchAny`,
|
||||||
|
`multiFuzzyMatchAnyIndex`, `multiFuzzyMatchAllIndices`) use the [Vectorscan](https://github.com/VectorCamp/vectorscan) library. As such,
|
||||||
|
they are only enabled if ClickHouse is compiled with support for vectorscan.
|
||||||
|
|
||||||
|
Due to restrictions of vectorscan, the length of the `haystack` string must be less than 2<sup>32</sup> bytes.
|
||||||
|
|
||||||
|
Hyperscan is generally vulnerable to regular expression denial of service (ReDoS) attacks (e.g. see
|
||||||
|
[here](https://www.usenix.org/conference/usenixsecurity22/presentation/turonova), [here](https://doi.org/10.1007/s10664-021-10033-1) and
|
||||||
|
[here](https://doi.org/10.1145/3236024.3236027)). Users are advised to check the provided patterns carefully.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
## multiMatchAnyIndex(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\])
|
## multiMatchAnyIndex(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\])
|
||||||
|
@ -446,6 +446,7 @@ class IColumn;
|
|||||||
M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \
|
M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \
|
||||||
M(UInt64, max_hyperscan_regexp_length, 0, "Max length of regexp than can be used in hyperscan multi-match functions. Zero means unlimited.", 0) \
|
M(UInt64, max_hyperscan_regexp_length, 0, "Max length of regexp than can be used in hyperscan multi-match functions. Zero means unlimited.", 0) \
|
||||||
M(UInt64, max_hyperscan_regexp_total_length, 0, "Max total length of all regexps than can be used in hyperscan multi-match functions (per every function). Zero means unlimited.", 0) \
|
M(UInt64, max_hyperscan_regexp_total_length, 0, "Max total length of all regexps than can be used in hyperscan multi-match functions (per every function). Zero means unlimited.", 0) \
|
||||||
|
M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \
|
||||||
M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \
|
M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \
|
||||||
M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \
|
M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \
|
||||||
\
|
\
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
|
|
||||||
#include <Functions/Regexps.h>
|
#include <Functions/Regexps.h>
|
||||||
|
#include <Functions/checkHyperscanRegexp.h>
|
||||||
#include <QueryPipeline/QueryPipeline.h>
|
#include <QueryPipeline/QueryPipeline.h>
|
||||||
|
|
||||||
#include <Dictionaries/ClickHouseDictionarySource.h>
|
#include <Dictionaries/ClickHouseDictionarySource.h>
|
||||||
@ -152,53 +153,6 @@ void RegExpTreeDictionary::calculateBytesAllocated()
|
|||||||
bytes_allocated += 2 * sizeof(UInt64) * topology_order.size();
|
bytes_allocated += 2 * sizeof(UInt64) * topology_order.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace
|
|
||||||
{
|
|
||||||
/// hyper scan is not good at processing regex containing {0, 200}
|
|
||||||
/// This will make re compilation slow and failed. So we select this heavy regular expressions and
|
|
||||||
/// process it with re2.
|
|
||||||
struct RegexChecker
|
|
||||||
{
|
|
||||||
re2_st::RE2 searcher;
|
|
||||||
RegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {}
|
|
||||||
|
|
||||||
static bool isFigureLargerThanFifty(const String & str)
|
|
||||||
try
|
|
||||||
{
|
|
||||||
auto number = std::stoi(str);
|
|
||||||
return number > 50;
|
|
||||||
}
|
|
||||||
catch (std::exception &)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
[[maybe_unused]]
|
|
||||||
bool isSimpleRegex(const String & regex) const
|
|
||||||
{
|
|
||||||
|
|
||||||
re2_st::StringPiece haystack(regex.data(), regex.size());
|
|
||||||
re2_st::StringPiece matches[10];
|
|
||||||
size_t start_pos = 0;
|
|
||||||
while (start_pos < regex.size())
|
|
||||||
{
|
|
||||||
if (searcher.Match(haystack, start_pos, regex.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10))
|
|
||||||
{
|
|
||||||
const auto & match = matches[0];
|
|
||||||
start_pos += match.length();
|
|
||||||
const auto & match1 = matches[1];
|
|
||||||
const auto & match2 = matches[2];
|
|
||||||
if (isFigureLargerThanFifty(match1.ToString()) || isFigureLargerThanFifty(match2.ToString()))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
void RegExpTreeDictionary::initRegexNodes(Block & block)
|
void RegExpTreeDictionary::initRegexNodes(Block & block)
|
||||||
{
|
{
|
||||||
auto id_column = block.getByName(kId).column;
|
auto id_column = block.getByName(kId).column;
|
||||||
@ -207,7 +161,9 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
|
|||||||
auto keys_column = block.getByName(kKeys).column;
|
auto keys_column = block.getByName(kKeys).column;
|
||||||
auto values_column = block.getByName(kValues).column;
|
auto values_column = block.getByName(kValues).column;
|
||||||
|
|
||||||
RegexChecker checker;
|
#ifdef USE_VECTORSCAN
|
||||||
|
SlowWithHyperscanChecker checker;
|
||||||
|
#endif
|
||||||
|
|
||||||
size_t size = block.rows();
|
size_t size = block.rows();
|
||||||
for (size_t i = 0; i < size; i++)
|
for (size_t i = 0; i < size; i++)
|
||||||
@ -253,7 +209,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
|
|||||||
}
|
}
|
||||||
regex_nodes.emplace(id, node);
|
regex_nodes.emplace(id, node);
|
||||||
#if USE_VECTORSCAN
|
#if USE_VECTORSCAN
|
||||||
if (use_vectorscan && checker.isSimpleRegex(regex))
|
if (use_vectorscan && !checker.isSlow(regex))
|
||||||
{
|
{
|
||||||
simple_regexps.push_back(regex);
|
simple_regexps.push_back(regex);
|
||||||
regexp_ids.push_back(id);
|
regexp_ids.push_back(id);
|
||||||
|
@ -39,13 +39,14 @@ public:
|
|||||||
static FunctionPtr create(ContextPtr context)
|
static FunctionPtr create(ContextPtr context)
|
||||||
{
|
{
|
||||||
const auto & settings = context->getSettingsRef();
|
const auto & settings = context->getSettingsRef();
|
||||||
return std::make_shared<FunctionsMultiStringFuzzySearch>(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length);
|
return std::make_shared<FunctionsMultiStringFuzzySearch>(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
FunctionsMultiStringFuzzySearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_)
|
FunctionsMultiStringFuzzySearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_, bool reject_expensive_hyperscan_regexps_)
|
||||||
: allow_hyperscan(allow_hyperscan_)
|
: allow_hyperscan(allow_hyperscan_)
|
||||||
, max_hyperscan_regexp_length(max_hyperscan_regexp_length_)
|
, max_hyperscan_regexp_length(max_hyperscan_regexp_length_)
|
||||||
, max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_)
|
, max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_)
|
||||||
|
, reject_expensive_hyperscan_regexps(reject_expensive_hyperscan_regexps_)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
String getName() const override { return name; }
|
String getName() const override { return name; }
|
||||||
@ -112,14 +113,14 @@ public:
|
|||||||
col_needles_const->getValue<Array>(),
|
col_needles_const->getValue<Array>(),
|
||||||
vec_res, offsets_res,
|
vec_res, offsets_res,
|
||||||
edit_distance,
|
edit_distance,
|
||||||
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
else
|
else
|
||||||
Impl::vectorVector(
|
Impl::vectorVector(
|
||||||
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
|
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
|
||||||
col_needles_vector->getData(), col_needles_vector->getOffsets(),
|
col_needles_vector->getData(), col_needles_vector->getOffsets(),
|
||||||
vec_res, offsets_res,
|
vec_res, offsets_res,
|
||||||
edit_distance,
|
edit_distance,
|
||||||
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
|
|
||||||
// the combination of const haystack + const needle is not implemented because
|
// the combination of const haystack + const needle is not implemented because
|
||||||
// useDefaultImplementationForConstants() == true makes upper layers convert both to
|
// useDefaultImplementationForConstants() == true makes upper layers convert both to
|
||||||
@ -135,6 +136,7 @@ private:
|
|||||||
const bool allow_hyperscan;
|
const bool allow_hyperscan;
|
||||||
const size_t max_hyperscan_regexp_length;
|
const size_t max_hyperscan_regexp_length;
|
||||||
const size_t max_hyperscan_regexp_total_length;
|
const size_t max_hyperscan_regexp_total_length;
|
||||||
|
const bool reject_expensive_hyperscan_regexps;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -53,13 +53,14 @@ public:
|
|||||||
static FunctionPtr create(ContextPtr context)
|
static FunctionPtr create(ContextPtr context)
|
||||||
{
|
{
|
||||||
const auto & settings = context->getSettingsRef();
|
const auto & settings = context->getSettingsRef();
|
||||||
return std::make_shared<FunctionsMultiStringSearch>(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length);
|
return std::make_shared<FunctionsMultiStringSearch>(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
FunctionsMultiStringSearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_)
|
FunctionsMultiStringSearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_, bool reject_expensive_hyperscan_regexps_)
|
||||||
: allow_hyperscan(allow_hyperscan_)
|
: allow_hyperscan(allow_hyperscan_)
|
||||||
, max_hyperscan_regexp_length(max_hyperscan_regexp_length_)
|
, max_hyperscan_regexp_length(max_hyperscan_regexp_length_)
|
||||||
, max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_)
|
, max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_)
|
||||||
|
, reject_expensive_hyperscan_regexps(reject_expensive_hyperscan_regexps_)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
String getName() const override { return name; }
|
String getName() const override { return name; }
|
||||||
@ -108,13 +109,13 @@ public:
|
|||||||
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
|
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
|
||||||
col_needles_const->getValue<Array>(),
|
col_needles_const->getValue<Array>(),
|
||||||
vec_res, offsets_res,
|
vec_res, offsets_res,
|
||||||
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
else
|
else
|
||||||
Impl::vectorVector(
|
Impl::vectorVector(
|
||||||
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
|
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(),
|
||||||
col_needles_vector->getData(), col_needles_vector->getOffsets(),
|
col_needles_vector->getData(), col_needles_vector->getOffsets(),
|
||||||
vec_res, offsets_res,
|
vec_res, offsets_res,
|
||||||
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
|
|
||||||
// the combination of const haystack + const needle is not implemented because
|
// the combination of const haystack + const needle is not implemented because
|
||||||
// useDefaultImplementationForConstants() == true makes upper layers convert both to
|
// useDefaultImplementationForConstants() == true makes upper layers convert both to
|
||||||
@ -130,6 +131,7 @@ private:
|
|||||||
const bool allow_hyperscan;
|
const bool allow_hyperscan;
|
||||||
const size_t max_hyperscan_regexp_length;
|
const size_t max_hyperscan_regexp_length;
|
||||||
const size_t max_hyperscan_regexp_total_length;
|
const size_t max_hyperscan_regexp_total_length;
|
||||||
|
const bool reject_expensive_hyperscan_regexps;
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
#include <Columns/ColumnString.h>
|
#include <Columns/ColumnString.h>
|
||||||
#include <Columns/ColumnsNumber.h>
|
#include <Columns/ColumnsNumber.h>
|
||||||
#include <Core/ColumnNumbers.h>
|
#include <Core/ColumnNumbers.h>
|
||||||
#include "Regexps.h"
|
#include <Functions/Regexps.h>
|
||||||
|
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
#include <re2_st/re2.h>
|
#include <re2_st/re2.h>
|
||||||
|
@ -6,7 +6,7 @@
|
|||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <DataTypes/DataTypeArray.h>
|
#include <DataTypes/DataTypeArray.h>
|
||||||
#include <Functions/checkHyperscanRegexp.h>
|
#include <Functions/checkHyperscanRegexp.h>
|
||||||
#include "Regexps.h"
|
#include <Functions/Regexps.h>
|
||||||
|
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
@ -51,9 +51,10 @@ struct MultiMatchAllIndicesImpl
|
|||||||
PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & offsets,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vectorConstant(
|
static void vectorConstant(
|
||||||
@ -65,7 +66,8 @@ struct MultiMatchAllIndicesImpl
|
|||||||
std::optional<UInt32> edit_distance,
|
std::optional<UInt32> edit_distance,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
if (!allow_hyperscan)
|
if (!allow_hyperscan)
|
||||||
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
||||||
@ -77,6 +79,14 @@ struct MultiMatchAllIndicesImpl
|
|||||||
|
|
||||||
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
||||||
|
|
||||||
|
if (reject_expensive_hyperscan_regexps)
|
||||||
|
{
|
||||||
|
SlowWithHyperscanChecker checker;
|
||||||
|
for (auto needle : needles)
|
||||||
|
if (checker.isSlow(needle))
|
||||||
|
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'.");
|
||||||
|
}
|
||||||
|
|
||||||
offsets.resize(haystack_offsets.size());
|
offsets.resize(haystack_offsets.size());
|
||||||
|
|
||||||
if (needles_arr.empty())
|
if (needles_arr.empty())
|
||||||
@ -135,6 +145,7 @@ struct MultiMatchAllIndicesImpl
|
|||||||
(void)edit_distance;
|
(void)edit_distance;
|
||||||
(void)max_hyperscan_regexp_length;
|
(void)max_hyperscan_regexp_length;
|
||||||
(void)max_hyperscan_regexp_total_length;
|
(void)max_hyperscan_regexp_total_length;
|
||||||
|
(void)reject_expensive_hyperscan_regexps;
|
||||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "multi-search all indices is not implemented when vectorscan is off");
|
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "multi-search all indices is not implemented when vectorscan is off");
|
||||||
#endif // USE_VECTORSCAN
|
#endif // USE_VECTORSCAN
|
||||||
}
|
}
|
||||||
@ -148,9 +159,10 @@ struct MultiMatchAllIndicesImpl
|
|||||||
PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & offsets,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vectorVector(
|
static void vectorVector(
|
||||||
@ -163,7 +175,8 @@ struct MultiMatchAllIndicesImpl
|
|||||||
std::optional<UInt32> edit_distance,
|
std::optional<UInt32> edit_distance,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
if (!allow_hyperscan)
|
if (!allow_hyperscan)
|
||||||
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
||||||
@ -195,6 +208,14 @@ struct MultiMatchAllIndicesImpl
|
|||||||
|
|
||||||
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
||||||
|
|
||||||
|
if (reject_expensive_hyperscan_regexps)
|
||||||
|
{
|
||||||
|
SlowWithHyperscanChecker checker;
|
||||||
|
for (auto needle : needles)
|
||||||
|
if (checker.isSlow(needle))
|
||||||
|
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'.");
|
||||||
|
}
|
||||||
|
|
||||||
MultiRegexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps::getOrSet</*SaveIndices*/ true, WithEditDistance>(needles, edit_distance);
|
MultiRegexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps::getOrSet</*SaveIndices*/ true, WithEditDistance>(needles, edit_distance);
|
||||||
MultiRegexps::Regexps * regexps = deferred_constructed_regexps->get();
|
MultiRegexps::Regexps * regexps = deferred_constructed_regexps->get();
|
||||||
hs_scratch_t * scratch = nullptr;
|
hs_scratch_t * scratch = nullptr;
|
||||||
@ -249,6 +270,7 @@ struct MultiMatchAllIndicesImpl
|
|||||||
(void)edit_distance;
|
(void)edit_distance;
|
||||||
(void)max_hyperscan_regexp_length;
|
(void)max_hyperscan_regexp_length;
|
||||||
(void)max_hyperscan_regexp_total_length;
|
(void)max_hyperscan_regexp_total_length;
|
||||||
|
(void)reject_expensive_hyperscan_regexps;
|
||||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "multi-search all indices is not implemented when vectorscan is off");
|
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "multi-search all indices is not implemented when vectorscan is off");
|
||||||
#endif // USE_VECTORSCAN
|
#endif // USE_VECTORSCAN
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
#include <Columns/ColumnString.h>
|
#include <Columns/ColumnString.h>
|
||||||
#include <DataTypes/DataTypesNumber.h>
|
#include <DataTypes/DataTypesNumber.h>
|
||||||
#include <Functions/checkHyperscanRegexp.h>
|
#include <Functions/checkHyperscanRegexp.h>
|
||||||
#include "Regexps.h"
|
#include <Functions/Regexps.h>
|
||||||
|
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
@ -65,9 +65,10 @@ struct MultiMatchAnyImpl
|
|||||||
PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & offsets,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vectorConstant(
|
static void vectorConstant(
|
||||||
@ -79,7 +80,8 @@ struct MultiMatchAnyImpl
|
|||||||
[[maybe_unused]] std::optional<UInt32> edit_distance,
|
[[maybe_unused]] std::optional<UInt32> edit_distance,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
if (!allow_hyperscan)
|
if (!allow_hyperscan)
|
||||||
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
||||||
@ -91,6 +93,14 @@ struct MultiMatchAnyImpl
|
|||||||
|
|
||||||
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
||||||
|
|
||||||
|
if (reject_expensive_hyperscan_regexps)
|
||||||
|
{
|
||||||
|
SlowWithHyperscanChecker checker;
|
||||||
|
for (auto needle : needles)
|
||||||
|
if (checker.isSlow(needle))
|
||||||
|
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'.");
|
||||||
|
}
|
||||||
|
|
||||||
res.resize(haystack_offsets.size());
|
res.resize(haystack_offsets.size());
|
||||||
|
|
||||||
if (needles_arr.empty())
|
if (needles_arr.empty())
|
||||||
@ -175,9 +185,10 @@ struct MultiMatchAnyImpl
|
|||||||
PaddedPODArray<UInt64> & offsets,
|
PaddedPODArray<UInt64> & offsets,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vectorVector(
|
static void vectorVector(
|
||||||
@ -190,7 +201,8 @@ struct MultiMatchAnyImpl
|
|||||||
std::optional<UInt32> edit_distance,
|
std::optional<UInt32> edit_distance,
|
||||||
bool allow_hyperscan,
|
bool allow_hyperscan,
|
||||||
size_t max_hyperscan_regexp_length,
|
size_t max_hyperscan_regexp_length,
|
||||||
size_t max_hyperscan_regexp_total_length)
|
size_t max_hyperscan_regexp_total_length,
|
||||||
|
bool reject_expensive_hyperscan_regexps)
|
||||||
{
|
{
|
||||||
if (!allow_hyperscan)
|
if (!allow_hyperscan)
|
||||||
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0");
|
||||||
@ -209,9 +221,7 @@ struct MultiMatchAnyImpl
|
|||||||
needles.reserve(needles_offsets[i] - prev_needles_offset);
|
needles.reserve(needles_offsets[i] - prev_needles_offset);
|
||||||
|
|
||||||
for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j)
|
for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j)
|
||||||
{
|
|
||||||
needles.emplace_back(needles_data_string->getDataAt(j).toView());
|
needles.emplace_back(needles_data_string->getDataAt(j).toView());
|
||||||
}
|
|
||||||
|
|
||||||
if (needles.empty())
|
if (needles.empty())
|
||||||
{
|
{
|
||||||
@ -223,6 +233,14 @@ struct MultiMatchAnyImpl
|
|||||||
|
|
||||||
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
||||||
|
|
||||||
|
if (reject_expensive_hyperscan_regexps)
|
||||||
|
{
|
||||||
|
SlowWithHyperscanChecker checker;
|
||||||
|
for (auto needle : needles)
|
||||||
|
if (checker.isSlow(needle))
|
||||||
|
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'.");
|
||||||
|
}
|
||||||
|
|
||||||
MultiRegexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps::getOrSet</*SaveIndices*/ FindAnyIndex, WithEditDistance>(needles, edit_distance);
|
MultiRegexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps::getOrSet</*SaveIndices*/ FindAnyIndex, WithEditDistance>(needles, edit_distance);
|
||||||
MultiRegexps::Regexps * regexps = deferred_constructed_regexps->get();
|
MultiRegexps::Regexps * regexps = deferred_constructed_regexps->get();
|
||||||
hs_scratch_t * scratch = nullptr;
|
hs_scratch_t * scratch = nullptr;
|
||||||
@ -309,6 +327,13 @@ struct MultiMatchAnyImpl
|
|||||||
|
|
||||||
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length);
|
||||||
|
|
||||||
|
for (auto needle : needles)
|
||||||
|
{
|
||||||
|
SlowWithHyperscanChecker checker;
|
||||||
|
if (checker.isSlow(needle))
|
||||||
|
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'.");
|
||||||
|
}
|
||||||
|
|
||||||
for (size_t j = 0; j < needles.size(); ++j)
|
for (size_t j = 0; j < needles.size(); ++j)
|
||||||
{
|
{
|
||||||
String needle(needles[j]);
|
String needle(needles[j]);
|
||||||
|
@ -32,7 +32,8 @@ struct MultiSearchFirstIndexImpl
|
|||||||
PaddedPODArray<UInt64> & /*offsets*/,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/,
|
||||||
|
bool /*reject_expensive_hyperscan_regexps*/)
|
||||||
{
|
{
|
||||||
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
||||||
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
||||||
@ -78,7 +79,8 @@ struct MultiSearchFirstIndexImpl
|
|||||||
PaddedPODArray<UInt64> & /*offsets*/,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/,
|
||||||
|
bool /*reject_expensive_hyperscan_regexps*/)
|
||||||
{
|
{
|
||||||
const size_t haystack_size = haystack_offsets.size();
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
res.resize(haystack_size);
|
res.resize(haystack_size);
|
||||||
|
@ -32,7 +32,8 @@ struct MultiSearchFirstPositionImpl
|
|||||||
PaddedPODArray<UInt64> & /*offsets*/,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/,
|
||||||
|
bool /*reject_expensive_hyperscan_regexps*/)
|
||||||
{
|
{
|
||||||
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
||||||
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
||||||
@ -87,7 +88,8 @@ struct MultiSearchFirstPositionImpl
|
|||||||
PaddedPODArray<UInt64> & /*offsets*/,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/,
|
||||||
|
bool /*reject_expensive_hyperscan_regexps*/)
|
||||||
{
|
{
|
||||||
const size_t haystack_size = haystack_offsets.size();
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
res.resize(haystack_size);
|
res.resize(haystack_size);
|
||||||
|
@ -32,7 +32,8 @@ struct MultiSearchImpl
|
|||||||
PaddedPODArray<UInt64> & /*offsets*/,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/,
|
||||||
|
bool /*reject_expensive_hyperscan_regexps*/)
|
||||||
{
|
{
|
||||||
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
// For performance of Volnitsky search, it is crucial to save only one byte for pattern number.
|
||||||
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
if (needles_arr.size() > std::numeric_limits<UInt8>::max())
|
||||||
@ -77,7 +78,8 @@ struct MultiSearchImpl
|
|||||||
PaddedPODArray<UInt64> & /*offsets*/,
|
PaddedPODArray<UInt64> & /*offsets*/,
|
||||||
bool /*allow_hyperscan*/,
|
bool /*allow_hyperscan*/,
|
||||||
size_t /*max_hyperscan_regexp_length*/,
|
size_t /*max_hyperscan_regexp_length*/,
|
||||||
size_t /*max_hyperscan_regexp_total_length*/)
|
size_t /*max_hyperscan_regexp_total_length*/,
|
||||||
|
bool /*reject_expensive_hyperscan_regexps*/)
|
||||||
{
|
{
|
||||||
const size_t haystack_size = haystack_offsets.size();
|
const size_t haystack_size = haystack_offsets.size();
|
||||||
res.resize(haystack_size);
|
res.resize(haystack_size);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <Functions/checkHyperscanRegexp.h>
|
#include <Functions/checkHyperscanRegexp.h>
|
||||||
|
|
||||||
#include <Common/Exception.h>
|
#include <Common/Exception.h>
|
||||||
|
#include <charconv>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -27,4 +28,78 @@ void checkHyperscanRegexp(const std::vector<std::string_view> & regexps, size_t
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
/// Returns true if `str` begins with a decimal integer whose value is greater than 50.
/// Returns false if `str` is empty or does not begin with a number.
bool isLargerThanFifty(std::string_view str)
{
    int number = 0;

    /// std::from_chars() takes `const char *`; string_view iterators are not guaranteed to be raw pointers on every
    /// standard library, so derive the range from data()/size() instead of begin()/end().
    const char * const first = str.data();
    const auto result = std::from_chars(first, first + str.size(), number);

    /// A number which does not even fit into `int` is certainly larger than fifty (unless it is negative). Treating it
    /// as "not larger" would let the most extreme repeat counts bypass the check.
    if (result.ec == std::errc::result_out_of_range)
        return str.front() != '-';

    if (result.ec != std::errc())
        return false;

    return number > 50;
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check for sub-patterns of the form x{n} or x{n,} can be expensive. Ignore spaces before/after n and m.
|
||||||
|
bool SlowWithHyperscanChecker::isSlowOneRepeat(std::string_view regexp)
|
||||||
|
{
|
||||||
|
re2_st::StringPiece haystack(regexp.data(), regexp.size());
|
||||||
|
re2_st::StringPiece matches[2];
|
||||||
|
size_t start_pos = 0;
|
||||||
|
while (start_pos < regexp.size())
|
||||||
|
{
|
||||||
|
if (searcher_one_repeat.Match(haystack, start_pos, regexp.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 2))
|
||||||
|
{
|
||||||
|
const auto & match = matches[0];
|
||||||
|
start_pos += (matches[0].data() - haystack.data()) + match.length(); // fwd by prefix + match length
|
||||||
|
const auto & submatch = matches[1];
|
||||||
|
if (isLargerThanFifty({submatch.data(), submatch.size()}))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if sub-patterns of the form x{n,m} can be expensive. Ignore spaces before/after n and m.
|
||||||
|
bool SlowWithHyperscanChecker::isSlowTwoRepeats(std::string_view regexp)
|
||||||
|
{
|
||||||
|
re2_st::StringPiece haystack(regexp.data(), regexp.size());
|
||||||
|
re2_st::StringPiece matches[3];
|
||||||
|
size_t start_pos = 0;
|
||||||
|
while (start_pos < regexp.size())
|
||||||
|
{
|
||||||
|
if (searcher_two_repeats.Match(haystack, start_pos, regexp.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 3))
|
||||||
|
{
|
||||||
|
const auto & match = matches[0];
|
||||||
|
start_pos += (matches[0].data() - haystack.data()) + match.length(); // fwd by prefix + match length
|
||||||
|
const auto & submatch1 = matches[1];
|
||||||
|
const auto & submatch2 = matches[2];
|
||||||
|
if (isLargerThanFifty({submatch1.data(), submatch1.size()})
|
||||||
|
|| isLargerThanFifty({submatch2.data(), submatch2.size()}))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
SlowWithHyperscanChecker::SlowWithHyperscanChecker()
|
||||||
|
: searcher_one_repeat(R"(\{\s*([\d]+)\s*,?\s*})")
|
||||||
|
, searcher_two_repeats(R"(\{\s*([\d]+)\s*,\s*([\d]+)\s*\})")
|
||||||
|
{}
|
||||||
|
|
||||||
|
bool SlowWithHyperscanChecker::isSlow(std::string_view regexp)
|
||||||
|
{
|
||||||
|
if (isSlowOneRepeat(regexp))
|
||||||
|
return true;
|
||||||
|
else if (isSlowTwoRepeats(regexp))
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -3,9 +3,27 @@
|
|||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include <re2_st/re2.h>
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
|
|
||||||
void checkHyperscanRegexp(const std::vector<std::string_view> & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length);
|
void checkHyperscanRegexp(const std::vector<std::string_view> & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length);
|
||||||
|
|
||||||
|
/// Regexp evaluation with hyperscan can be slow for certain patterns due to NFA state explosion. Try to identify such patterns on a
|
||||||
|
/// best-effort basis.
|
||||||
|
|
||||||
|
class SlowWithHyperscanChecker
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
SlowWithHyperscanChecker();
|
||||||
|
bool isSlow(std::string_view regexp);
|
||||||
|
|
||||||
|
private:
|
||||||
|
bool isSlowOneRepeat(std::string_view regexp);
|
||||||
|
bool isSlowTwoRepeats(std::string_view regexp);
|
||||||
|
re2_st::RE2 searcher_one_repeat;
|
||||||
|
re2_st::RE2 searcher_two_repeats;
|
||||||
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#include "FunctionsStringSearchToString.h"
|
#include <Functions/FunctionsStringSearchToString.h>
|
||||||
#include "FunctionFactory.h"
|
#include <Functions/FunctionFactory.h>
|
||||||
#include "Regexps.h"
|
#include <Functions/Regexps.h>
|
||||||
#include <Common/OptimizedRegularExpression.h>
|
#include <Common/OptimizedRegularExpression.h>
|
||||||
|
|
||||||
|
|
||||||
|
58
tests/queries/0_stateless/02560_regexp_denial_of_service.sql
Normal file
58
tests/queries/0_stateless/02560_regexp_denial_of_service.sql
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
-- Tags: no-fasttest, use-vectorscan

DROP TABLE IF EXISTS t;

-- Part 1: regexps with overly large bounded repeats must be rejected.
-- Every statement below is expected to fail with HYPERSCAN_CANNOT_SCAN_TEXT.

-- Form {n}: plain, with spaces inside the braces, and embedded in a longer pattern
SELECT multiMatchAny('test', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{ 51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51 }']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['prefix.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51}.suffix']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{4,4}midfix{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

-- Form {n,}: same variations
SELECT multiMatchAny('test', ['.{51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{ 51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51 ,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51, }']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['prefix.{51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51,}.suffix']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{4,4}midfix{51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

-- Form {n,m}: either bound above the limit, same variations
SELECT multiMatchAny('test', ['.{1,51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51,52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{ 51,52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51 ,52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51, 52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{51,52 }']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['prefix.{1,51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{1,51}.suffix']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny('test', ['.{4,4}midfix{1,51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

-- Part 2: the rejection must apply to every vectorscan-based function,
-- both with a constant haystack and with a haystack read from a table column.

CREATE TABLE t(c String) Engine=MergeTree() ORDER BY c;
INSERT INTO t VALUES('Hallo Welt');

SELECT multiMatchAny('Hallo Welt', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAny(c, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

SELECT multiMatchAnyIndex('Hallo Welt', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAnyIndex(c, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

SELECT multiMatchAllIndices('Hallo Welt', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiMatchAllIndices(c, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

SELECT multiFuzzyMatchAny('Hallo Welt', 1, ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiFuzzyMatchAny(c, 1, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

SELECT multiFuzzyMatchAnyIndex('Hallo Welt', 1, ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiFuzzyMatchAnyIndex(c, 1, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

SELECT multiFuzzyMatchAllIndices('Hallo Welt', 1, ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }
SELECT multiFuzzyMatchAllIndices(c, 1, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT }

DROP TABLE t;
|
Loading…
Reference in New Issue
Block a user