diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 7b14b0e96de..b6b70c7795b 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -391,10 +391,18 @@ For patterns to search for substrings in a string, it is better to use LIKE or ## multiMatchAny(haystack, \[pattern1, pattern2, …, patternn\]) -The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. +The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. :::note -The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. +Functions `multiMatchAny`, `multiMatchAnyIndex`, `multiMatchAllIndices` and their fuzzy equivalents (`multiFuzzyMatchAny`, +`multiFuzzyMatchAnyIndex`, `multiFuzzyMatchAllIndices`) use the [Vectorscan](https://github.com/VectorCamp/vectorscan) library. As such, +they are only enabled if ClickHouse is compiled with support for vectorscan. + +Due to restrictions of vectorscan, the length of the `haystack` string must be less than 2<sup>32</sup> bytes. + +Hyperscan is generally vulnerable to regular expression denial of service (ReDoS) attacks (e.g. see +[here](https://www.usenix.org/conference/usenixsecurity22/presentation/turonova), [here](https://doi.org/10.1007/s10664-021-10033-1) and +[here](https://doi.org/10.1145/3236024.3236027)). Users are advised to check the provided patterns carefully.
::: ## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 983ec4d6416..e378de77875 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -446,6 +446,7 @@ class IColumn; M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \ M(UInt64, max_hyperscan_regexp_length, 0, "Max length of regexp than can be used in hyperscan multi-match functions. Zero means unlimited.", 0) \ M(UInt64, max_hyperscan_regexp_total_length, 0, "Max total length of all regexps than can be used in hyperscan multi-match functions (per every function). Zero means unlimited.", 0) \ + M(Bool, reject_expensive_hyperscan_regexps, true, "Reject patterns which will likely be expensive to evaluate with hyperscan (due to NFA state explosion)", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ \ diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index fdda2cd407d..c636f200324 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -152,53 +153,6 @@ void RegExpTreeDictionary::calculateBytesAllocated() bytes_allocated += 2 * sizeof(UInt64) * topology_order.size(); } -namespace -{ - /// hyper scan is not good at processing regex containing {0, 200} - /// This will make re compilation slow and failed. So we select this heavy regular expressions and - /// process it with re2. 
- struct RegexChecker - { - re2_st::RE2 searcher; - RegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {} - - static bool isFigureLargerThanFifty(const String & str) - try - { - auto number = std::stoi(str); - return number > 50; - } - catch (std::exception &) - { - return false; - } - - [[maybe_unused]] - bool isSimpleRegex(const String & regex) const - { - - re2_st::StringPiece haystack(regex.data(), regex.size()); - re2_st::StringPiece matches[10]; - size_t start_pos = 0; - while (start_pos < regex.size()) - { - if (searcher.Match(haystack, start_pos, regex.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10)) - { - const auto & match = matches[0]; - start_pos += match.length(); - const auto & match1 = matches[1]; - const auto & match2 = matches[2]; - if (isFigureLargerThanFifty(match1.ToString()) || isFigureLargerThanFifty(match2.ToString())) - return false; - } - else - break; - } - return true; - } - }; -} - void RegExpTreeDictionary::initRegexNodes(Block & block) { auto id_column = block.getByName(kId).column; @@ -207,7 +161,9 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) auto keys_column = block.getByName(kKeys).column; auto values_column = block.getByName(kValues).column; - RegexChecker checker; +#ifdef USE_VECTORSCAN + SlowWithHyperscanChecker checker; +#endif size_t size = block.rows(); for (size_t i = 0; i < size; i++) @@ -253,7 +209,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block) } regex_nodes.emplace(id, node); #if USE_VECTORSCAN - if (use_vectorscan && checker.isSimpleRegex(regex)) + if (use_vectorscan && !checker.isSlow(regex)) { simple_regexps.push_back(regex); regexp_ids.push_back(id); diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 5d86dd4dada..00d989f388e 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -39,13 +39,14 @@ public: static FunctionPtr create(ContextPtr context) { const 
auto & settings = context->getSettingsRef(); - return std::make_shared(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); + return std::make_shared(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps); } - FunctionsMultiStringFuzzySearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) + FunctionsMultiStringFuzzySearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_, bool reject_expensive_hyperscan_regexps_) : allow_hyperscan(allow_hyperscan_) , max_hyperscan_regexp_length(max_hyperscan_regexp_length_) , max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) + , reject_expensive_hyperscan_regexps(reject_expensive_hyperscan_regexps_) {} String getName() const override { return name; } @@ -112,14 +113,14 @@ public: col_needles_const->getValue(), vec_res, offsets_res, edit_distance, - allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); else Impl::vectorVector( col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needles_vector->getData(), col_needles_vector->getOffsets(), vec_res, offsets_res, edit_distance, - allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); // the combination of const haystack + const needle is not implemented because // useDefaultImplementationForConstants() == true makes upper layers convert both to @@ -135,6 +136,7 @@ private: const bool allow_hyperscan; const size_t max_hyperscan_regexp_length; const size_t max_hyperscan_regexp_total_length; + const bool 
reject_expensive_hyperscan_regexps; }; } diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 2465567b883..c0ed90aa042 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -53,13 +53,14 @@ public: static FunctionPtr create(ContextPtr context) { const auto & settings = context->getSettingsRef(); - return std::make_shared(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length); + return std::make_shared(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps); } - FunctionsMultiStringSearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) + FunctionsMultiStringSearch(bool allow_hyperscan_, size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_, bool reject_expensive_hyperscan_regexps_) : allow_hyperscan(allow_hyperscan_) , max_hyperscan_regexp_length(max_hyperscan_regexp_length_) , max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) + , reject_expensive_hyperscan_regexps(reject_expensive_hyperscan_regexps_) {} String getName() const override { return name; } @@ -108,13 +109,13 @@ public: col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needles_const->getValue(), vec_res, offsets_res, - allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); else Impl::vectorVector( col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needles_vector->getData(), col_needles_vector->getOffsets(), vec_res, offsets_res, - allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + allow_hyperscan, max_hyperscan_regexp_length, 
max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); // the combination of const haystack + const needle is not implemented because // useDefaultImplementationForConstants() == true makes upper layers convert both to @@ -130,6 +131,7 @@ private: const bool allow_hyperscan; const size_t max_hyperscan_regexp_length; const size_t max_hyperscan_regexp_total_length; + const bool reject_expensive_hyperscan_regexps; }; } diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 00c08ea8531..db8dd55474e 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -6,7 +6,7 @@ #include #include #include -#include "Regexps.h" +#include #include "config.h" #include diff --git a/src/Functions/MultiMatchAllIndicesImpl.h b/src/Functions/MultiMatchAllIndicesImpl.h index dec8349e693..d655311f532 100644 --- a/src/Functions/MultiMatchAllIndicesImpl.h +++ b/src/Functions/MultiMatchAllIndicesImpl.h @@ -6,7 +6,7 @@ #include #include #include -#include "Regexps.h" +#include #include "config.h" @@ -51,9 +51,10 @@ struct MultiMatchAllIndicesImpl PaddedPODArray & offsets, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { - vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); } static void vectorConstant( @@ -65,7 +66,8 @@ struct MultiMatchAllIndicesImpl std::optional edit_distance, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { if 
(!allow_hyperscan) throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); @@ -77,6 +79,14 @@ struct MultiMatchAllIndicesImpl checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + if (reject_expensive_hyperscan_regexps) + { + SlowWithHyperscanChecker checker; + for (auto needle : needles) + if (checker.isSlow(needle)) + throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'."); + } + offsets.resize(haystack_offsets.size()); if (needles_arr.empty()) @@ -135,6 +145,7 @@ struct MultiMatchAllIndicesImpl (void)edit_distance; (void)max_hyperscan_regexp_length; (void)max_hyperscan_regexp_total_length; + (void)reject_expensive_hyperscan_regexps; throw Exception(ErrorCodes::NOT_IMPLEMENTED, "multi-search all indices is not implemented when vectorscan is off"); #endif // USE_VECTORSCAN } @@ -148,9 +159,10 @@ struct MultiMatchAllIndicesImpl PaddedPODArray & offsets, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { - vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); } static void vectorVector( @@ -163,7 +175,8 @@ struct MultiMatchAllIndicesImpl std::optional edit_distance, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool 
reject_expensive_hyperscan_regexps) { if (!allow_hyperscan) throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); @@ -195,6 +208,14 @@ struct MultiMatchAllIndicesImpl checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + if (reject_expensive_hyperscan_regexps) + { + SlowWithHyperscanChecker checker; + for (auto needle : needles) + if (checker.isSlow(needle)) + throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'."); + } + MultiRegexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps::getOrSet(needles, edit_distance); MultiRegexps::Regexps * regexps = deferred_constructed_regexps->get(); hs_scratch_t * scratch = nullptr; @@ -249,6 +270,7 @@ struct MultiMatchAllIndicesImpl (void)edit_distance; (void)max_hyperscan_regexp_length; (void)max_hyperscan_regexp_total_length; + (void)reject_expensive_hyperscan_regexps; throw Exception(ErrorCodes::NOT_IMPLEMENTED, "multi-search all indices is not implemented when vectorscan is off"); #endif // USE_VECTORSCAN } diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 0e42d4f6b58..7a35fcebc24 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -5,7 +5,7 @@ #include #include #include -#include "Regexps.h" +#include #include "config.h" @@ -65,9 +65,10 @@ struct MultiMatchAnyImpl PaddedPODArray & offsets, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { - vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + 
vectorConstant(haystack_data, haystack_offsets, needles_arr, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); } static void vectorConstant( @@ -79,7 +80,8 @@ struct MultiMatchAnyImpl [[maybe_unused]] std::optional edit_distance, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { if (!allow_hyperscan) throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); @@ -91,6 +93,14 @@ struct MultiMatchAnyImpl checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + if (reject_expensive_hyperscan_regexps) + { + SlowWithHyperscanChecker checker; + for (auto needle : needles) + if (checker.isSlow(needle)) + throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. 
To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'."); + } + res.resize(haystack_offsets.size()); if (needles_arr.empty()) @@ -175,9 +185,10 @@ struct MultiMatchAnyImpl PaddedPODArray & offsets, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { - vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + vectorVector(haystack_data, haystack_offsets, needles_data, needles_offsets, res, offsets, std::nullopt, allow_hyperscan, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length, reject_expensive_hyperscan_regexps); } static void vectorVector( @@ -190,7 +201,8 @@ struct MultiMatchAnyImpl std::optional edit_distance, bool allow_hyperscan, size_t max_hyperscan_regexp_length, - size_t max_hyperscan_regexp_total_length) + size_t max_hyperscan_regexp_total_length, + bool reject_expensive_hyperscan_regexps) { if (!allow_hyperscan) throw Exception(ErrorCodes::FUNCTION_NOT_ALLOWED, "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0"); @@ -209,9 +221,7 @@ struct MultiMatchAnyImpl needles.reserve(needles_offsets[i] - prev_needles_offset); for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) - { needles.emplace_back(needles_data_string->getDataAt(j).toView()); - } if (needles.empty()) { @@ -223,6 +233,14 @@ struct MultiMatchAnyImpl checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + if (reject_expensive_hyperscan_regexps) + { + SlowWithHyperscanChecker checker; + for (auto needle : needles) + if (checker.isSlow(needle)) + throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. 
To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'."); + } + MultiRegexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = MultiRegexps::getOrSet(needles, edit_distance); MultiRegexps::Regexps * regexps = deferred_constructed_regexps->get(); hs_scratch_t * scratch = nullptr; @@ -309,6 +327,13 @@ struct MultiMatchAnyImpl checkHyperscanRegexp(needles, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + for (auto needle : needles) + { + SlowWithHyperscanChecker checker; + if (checker.isSlow(needle)) + throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Regular expression evaluation in vectorscan will be too slow. To ignore this error, disable setting 'reject_expensive_hyperscan_regexps'."); + } + for (size_t j = 0; j < needles.size(); ++j) { String needle(needles[j]); diff --git a/src/Functions/MultiSearchFirstIndexImpl.h b/src/Functions/MultiSearchFirstIndexImpl.h index f108f2ca41e..73f3c92adfb 100644 --- a/src/Functions/MultiSearchFirstIndexImpl.h +++ b/src/Functions/MultiSearchFirstIndexImpl.h @@ -32,7 +32,8 @@ struct MultiSearchFirstIndexImpl PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, - size_t /*max_hyperscan_regexp_total_length*/) + size_t /*max_hyperscan_regexp_total_length*/, + bool /*reject_expensive_hyperscan_regexps*/) { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. 
if (needles_arr.size() > std::numeric_limits::max()) @@ -78,7 +79,8 @@ struct MultiSearchFirstIndexImpl PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, - size_t /*max_hyperscan_regexp_total_length*/) + size_t /*max_hyperscan_regexp_total_length*/, + bool /*reject_expensive_hyperscan_regexps*/) { const size_t haystack_size = haystack_offsets.size(); res.resize(haystack_size); diff --git a/src/Functions/MultiSearchFirstPositionImpl.h b/src/Functions/MultiSearchFirstPositionImpl.h index 69cab478d13..99dd3f9d394 100644 --- a/src/Functions/MultiSearchFirstPositionImpl.h +++ b/src/Functions/MultiSearchFirstPositionImpl.h @@ -32,7 +32,8 @@ struct MultiSearchFirstPositionImpl PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, - size_t /*max_hyperscan_regexp_total_length*/) + size_t /*max_hyperscan_regexp_total_length*/, + bool /*reject_expensive_hyperscan_regexps*/) { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. 
if (needles_arr.size() > std::numeric_limits::max()) @@ -87,7 +88,8 @@ struct MultiSearchFirstPositionImpl PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, - size_t /*max_hyperscan_regexp_total_length*/) + size_t /*max_hyperscan_regexp_total_length*/, + bool /*reject_expensive_hyperscan_regexps*/) { const size_t haystack_size = haystack_offsets.size(); res.resize(haystack_size); diff --git a/src/Functions/MultiSearchImpl.h b/src/Functions/MultiSearchImpl.h index d42c2ca43e4..fb7d56f302a 100644 --- a/src/Functions/MultiSearchImpl.h +++ b/src/Functions/MultiSearchImpl.h @@ -32,7 +32,8 @@ struct MultiSearchImpl PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, - size_t /*max_hyperscan_regexp_total_length*/) + size_t /*max_hyperscan_regexp_total_length*/, + bool /*reject_expensive_hyperscan_regexps*/) { // For performance of Volnitsky search, it is crucial to save only one byte for pattern number. if (needles_arr.size() > std::numeric_limits::max()) @@ -77,7 +78,8 @@ struct MultiSearchImpl PaddedPODArray & /*offsets*/, bool /*allow_hyperscan*/, size_t /*max_hyperscan_regexp_length*/, - size_t /*max_hyperscan_regexp_total_length*/) + size_t /*max_hyperscan_regexp_total_length*/, + bool /*reject_expensive_hyperscan_regexps*/) { const size_t haystack_size = haystack_offsets.size(); res.resize(haystack_size); diff --git a/src/Functions/checkHyperscanRegexp.cpp b/src/Functions/checkHyperscanRegexp.cpp index 4a1bc4f9031..e6fbc3baa1a 100644 --- a/src/Functions/checkHyperscanRegexp.cpp +++ b/src/Functions/checkHyperscanRegexp.cpp @@ -1,6 +1,7 @@ #include #include +#include namespace DB { @@ -27,4 +28,78 @@ void checkHyperscanRegexp(const std::vector & regexps, size_t } } +namespace +{ + +bool isLargerThanFifty(std::string_view str) +{ + int number; + auto [_, ec] = std::from_chars(str.begin(), str.end(), number); + if (ec != std::errc()) + return false; + return number > 50; +} + +} + 
+/// Check if sub-patterns of the form x{n} or x{n,} can be expensive (repeat count n larger than 50). Ignore spaces before/after n.
+bool SlowWithHyperscanChecker::isSlowOneRepeat(std::string_view regexp)
+{
+    re2_st::StringPiece haystack(regexp.data(), regexp.size());
+    re2_st::StringPiece matches[2];
+    size_t start_pos = 0;
+    while (start_pos < regexp.size())
+    {
+        if (searcher_one_repeat.Match(haystack, start_pos, regexp.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 2))
+        {
+            const auto & match = matches[0];
+            start_pos = (match.data() - haystack.data()) + match.length(); // resume right behind this match; the offset is absolute, so assign instead of accumulating
+            const auto & submatch = matches[1];
+            if (isLargerThanFifty({submatch.data(), submatch.size()}))
+                return true;
+        }
+        else
+            break;
+    }
+    return false;
+}
+
+/// Check if sub-patterns of the form x{n,m} can be expensive (n or m larger than 50). Ignore spaces before/after n and m.
+bool SlowWithHyperscanChecker::isSlowTwoRepeats(std::string_view regexp)
+{
+    re2_st::StringPiece haystack(regexp.data(), regexp.size());
+    re2_st::StringPiece matches[3];
+    size_t start_pos = 0;
+    while (start_pos < regexp.size())
+    {
+        if (searcher_two_repeats.Match(haystack, start_pos, regexp.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 3))
+        {
+            const auto & match = matches[0];
+            start_pos = (match.data() - haystack.data()) + match.length(); // resume right behind this match; the offset is absolute, so assign instead of accumulating
+            const auto & submatch1 = matches[1];
+            const auto & submatch2 = matches[2];
+            if (isLargerThanFifty({submatch1.data(), submatch1.size()})
+                || isLargerThanFifty({submatch2.data(), submatch2.size()}))
+                return true;
+        }
+        else
+            break;
+    }
+    return false;
+}
+
+SlowWithHyperscanChecker::SlowWithHyperscanChecker()
+    : searcher_one_repeat(R"(\{\s*([\d]+)\s*,?\s*})")
+    , searcher_two_repeats(R"(\{\s*([\d]+)\s*,\s*([\d]+)\s*\})")
+{}
+
+bool SlowWithHyperscanChecker::isSlow(std::string_view regexp)
+{
+    if (isSlowOneRepeat(regexp))
+        return true;
+    else if (isSlowTwoRepeats(regexp))
+        return true;
+    return false;
+}
+
 }
diff --git
a/src/Functions/checkHyperscanRegexp.h b/src/Functions/checkHyperscanRegexp.h index e5f1165a949..8bb76e6ebc4 100644 --- a/src/Functions/checkHyperscanRegexp.h +++ b/src/Functions/checkHyperscanRegexp.h @@ -3,9 +3,27 @@ #include #include +#include + namespace DB { void checkHyperscanRegexp(const std::vector & regexps, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length); +/// Regexp evaluation with hyperscan can be slow for certain patterns due to NFA state explosion. Try to identify such patterns on a +/// best-effort basis. + +class SlowWithHyperscanChecker +{ +public: + SlowWithHyperscanChecker(); + bool isSlow(std::string_view regexp); + +private: + bool isSlowOneRepeat(std::string_view regexp); + bool isSlowTwoRepeats(std::string_view regexp); + re2_st::RE2 searcher_one_repeat; + re2_st::RE2 searcher_two_repeats; +}; + } diff --git a/src/Functions/extract.cpp b/src/Functions/extract.cpp index 5d539e03dae..74c5a2fdd36 100644 --- a/src/Functions/extract.cpp +++ b/src/Functions/extract.cpp @@ -1,6 +1,6 @@ -#include "FunctionsStringSearchToString.h" -#include "FunctionFactory.h" -#include "Regexps.h" +#include +#include +#include #include diff --git a/tests/queries/0_stateless/02560_regexp_denial_of_service.reference b/tests/queries/0_stateless/02560_regexp_denial_of_service.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02560_regexp_denial_of_service.sql b/tests/queries/0_stateless/02560_regexp_denial_of_service.sql new file mode 100644 index 00000000000..3a02c12c679 --- /dev/null +++ b/tests/queries/0_stateless/02560_regexp_denial_of_service.sql @@ -0,0 +1,58 @@ +-- Tags: no-fasttest, use-vectorscan + +DROP TABLE IF EXISTS t; + +-- test that the check which rejects hyperscan regexes with too big bounded repeats works + +-- {n} +SELECT multiMatchAny('test', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{ 51}']); -- { serverError 
HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51 }']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['prefix.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51}.suffix']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{4,4}midfix{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +-- {n,} +SELECT multiMatchAny('test', ['.{51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{ 51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51 ,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51, }']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['prefix.{51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51,}.suffix']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{4,4}midfix{51,}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +-- {n,m} +SELECT multiMatchAny('test', ['.{1,51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51,52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{ 51,52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51 ,52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51, 52}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{51,52 }']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['prefix.{1,51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{1,51}.suffix']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny('test', ['.{4,4}midfix{1,51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +-- test that the check is implemented in all functions which use 
vectorscan + +CREATE TABLE t(c String) Engine=MergeTree() ORDER BY c; +INSERT INTO t VALUES('Hallo Welt'); + +SELECT multiMatchAny('Hallo Welt', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAny(c, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +SELECT multiMatchAnyIndex('Hallo Welt', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAnyIndex(c, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +SELECT multiMatchAllIndices('Hallo Welt', ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiMatchAllIndices(c, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +SELECT multiFuzzyMatchAny('Hallo Welt', 1, ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiFuzzyMatchAny(c, 1, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +SELECT multiFuzzyMatchAnyIndex('Hallo Welt', 1, ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiFuzzyMatchAnyIndex(c, 1, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +SELECT multiFuzzyMatchAllIndices('Hallo Welt', 1, ['.{51}']); -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } +SELECT multiFuzzyMatchAllIndices(c, 1, ['.{51}']) FROM t; -- { serverError HYPERSCAN_CANNOT_SCAN_TEXT } + +DROP TABLE t;