From aa33a7add163c6668949a1f1056ec529975ae8ef Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 7 Aug 2021 13:07:41 +0800 Subject: [PATCH] Add settings to check hyperscan regexp length. --- src/Core/Settings.h | 2 ++ .../FunctionsMultiStringFuzzySearch.h | 16 +++++++++- src/Functions/FunctionsMultiStringSearch.h | 16 +++++++++- src/Functions/hyperscanRegexpChecker.cpp | 29 +++++++++++++++++++ src/Functions/hyperscanRegexpChecker.h | 10 +++++++ ...02004_max_hyperscan_regex_length.reference | 6 ++++ .../02004_max_hyperscan_regex_length.sql | 26 +++++++++++++++++ 7 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 src/Functions/hyperscanRegexpChecker.cpp create mode 100644 src/Functions/hyperscanRegexpChecker.h create mode 100644 tests/queries/0_stateless/02004_max_hyperscan_regex_length.reference create mode 100644 tests/queries/0_stateless/02004_max_hyperscan_regex_length.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 20404089210..d3493677af5 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -377,6 +377,8 @@ class IColumn; M(Bool, external_table_functions_use_nulls, true, "If it is set to true, external table functions will implicitly use Nullable type if needed. Otherwise NULLs will be substituted with default values. Currently supported only by 'mysql', 'postgresql' and 'odbc' table functions.", 0) \ \ M(Bool, allow_hyperscan, true, "Allow functions that use Hyperscan library. Disable to avoid potentially long compilation times and excessive resource usage.", 0) \ + M(UInt64, max_hyperscan_regexp_length, 0, "Max length of regexp than can be used in hyperscan multi-match functions. Zero means unlimited.", 0) \ + M(UInt64, max_hyperscan_regexp_total_length, 0, "Max total length of all regexps than can be used in hyperscan multi-match functions. Zero means unlimited.", 0) \ M(Bool, allow_simdjson, true, "Allow using simdjson library in 'JSON*' functions if AVX2 instructions are available. If disabled rapidjson will be used.", 0) \ M(Bool, allow_introspection_functions, false, "Allow functions for introspection of ELF and DWARF for query profiling. These functions are slow and may impose security considerations.", 0) \ \ diff --git a/src/Functions/FunctionsMultiStringFuzzySearch.h b/src/Functions/FunctionsMultiStringFuzzySearch.h index 209efb0fc2f..a2d0c972abb 100644 --- a/src/Functions/FunctionsMultiStringFuzzySearch.h +++ b/src/Functions/FunctionsMultiStringFuzzySearch.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -40,7 +41,13 @@ public: throw Exception( "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); - return std::make_shared(); + return std::make_shared( + context->getSettingsRef().max_hyperscan_regexp_length, context->getSettingsRef().max_hyperscan_regexp_total_length); + } + + FunctionsMultiStringFuzzySearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) + : max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) + { } String getName() const override { return name; } @@ -113,6 +120,9 @@ public: for (const auto & el : src_arr) refs.emplace_back(el.get()); + if (Impl::is_using_hyperscan) + checkRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); @@ -131,6 +141,10 @@ public: else return col_res; } + +private: + size_t max_hyperscan_regexp_length; + size_t max_hyperscan_regexp_total_length; }; } diff --git a/src/Functions/FunctionsMultiStringSearch.h b/src/Functions/FunctionsMultiStringSearch.h index 08b4668940e..3dd2e8bfd09 100644 --- a/src/Functions/FunctionsMultiStringSearch.h +++ b/src/Functions/FunctionsMultiStringSearch.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,13 @@ public: throw Exception( "Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED); - return std::make_shared(); + return std::make_shared( + context->getSettingsRef().max_hyperscan_regexp_length, context->getSettingsRef().max_hyperscan_regexp_total_length); + } + + FunctionsMultiStringSearch(size_t max_hyperscan_regexp_length_, size_t max_hyperscan_regexp_total_length_) + : max_hyperscan_regexp_length(max_hyperscan_regexp_length_), max_hyperscan_regexp_total_length(max_hyperscan_regexp_total_length_) + { } String getName() const override { return name; } @@ -105,6 +112,9 @@ public: for (const auto & el : src_arr) refs.emplace_back(el.get()); + if (Impl::is_using_hyperscan) + checkRegexp(refs, max_hyperscan_regexp_length, max_hyperscan_regexp_total_length); + auto col_res = ColumnVector::create(); auto col_offsets = ColumnArray::ColumnOffsets::create(); @@ -122,6 +132,10 @@ public: else return col_res; } + +private: + size_t max_hyperscan_regexp_length; + size_t max_hyperscan_regexp_total_length; }; } diff --git a/src/Functions/hyperscanRegexpChecker.cpp b/src/Functions/hyperscanRegexpChecker.cpp new file mode 100644 index 00000000000..b3c46e34daa --- /dev/null +++ b/src/Functions/hyperscanRegexpChecker.cpp @@ -0,0 +1,29 @@ +#include + +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +void checkRegexp(const std::vector & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length) +{ + if (max_hyperscan_regexp_length > 0 || max_hyperscan_regexp_total_length > 0) + { + size_t total_regexp_length = 0; + for (const auto & pattern : refs) + { + if (max_hyperscan_regexp_length > 0 && pattern.size > max_hyperscan_regexp_length) + throw Exception("Regexp length too large", ErrorCodes::BAD_ARGUMENTS); + total_regexp_length += pattern.size; + } + + if (max_hyperscan_regexp_total_length > 0 && total_regexp_length > max_hyperscan_regexp_total_length) + throw Exception("Total regexp lengths too large", ErrorCodes::BAD_ARGUMENTS); + } +} + +} diff --git a/src/Functions/hyperscanRegexpChecker.h b/src/Functions/hyperscanRegexpChecker.h new file mode 100644 index 00000000000..f2988120899 --- /dev/null +++ b/src/Functions/hyperscanRegexpChecker.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +void checkRegexp(const std::vector & refs, size_t max_hyperscan_regexp_length, size_t max_hyperscan_regexp_total_length); + +} diff --git a/tests/queries/0_stateless/02004_max_hyperscan_regex_length.reference b/tests/queries/0_stateless/02004_max_hyperscan_regex_length.reference new file mode 100644 index 00000000000..87e68c152c3 --- /dev/null +++ b/tests/queries/0_stateless/02004_max_hyperscan_regex_length.reference @@ -0,0 +1,6 @@ +1 +1 +[1] +1 +1 +[1] diff --git a/tests/queries/0_stateless/02004_max_hyperscan_regex_length.sql b/tests/queries/0_stateless/02004_max_hyperscan_regex_length.sql new file mode 100644 index 00000000000..6058ef2f430 --- /dev/null +++ b/tests/queries/0_stateless/02004_max_hyperscan_regex_length.sql @@ -0,0 +1,26 @@ +set max_hyperscan_regexp_length = 1; +set max_hyperscan_regexp_total_length = 1; + +select multiMatchAny('123', ['1']); +select multiMatchAny('123', ['12']); -- { serverError 36 } +select multiMatchAny('123', ['1', '2']); -- { serverError 36 } + +select multiMatchAnyIndex('123', ['1']); +select multiMatchAnyIndex('123', ['12']); -- { serverError 36 } +select multiMatchAnyIndex('123', ['1', '2']); -- { serverError 36 } + +select multiMatchAllIndices('123', ['1']); +select multiMatchAllIndices('123', ['12']); -- { serverError 36 } +select multiMatchAllIndices('123', ['1', '2']); -- { serverError 36 } + +select multiFuzzyMatchAny('123', 0, ['1']); +select multiFuzzyMatchAny('123', 0, ['12']); -- { serverError 36 } +select multiFuzzyMatchAny('123', 0, ['1', '2']); -- { serverError 36 } + +select multiFuzzyMatchAnyIndex('123', 0, ['1']); +select multiFuzzyMatchAnyIndex('123', 0, ['12']); -- { serverError 36 } +select multiFuzzyMatchAnyIndex('123', 0, ['1', '2']); -- { serverError 36 } + +select multiFuzzyMatchAllIndices('123', 0, ['1']); +select multiFuzzyMatchAllIndices('123', 0, ['12']); -- { serverError 36 } +select multiFuzzyMatchAllIndices('123', 0, ['1', '2']); -- { serverError 36 }