diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp index da348adbe31..cfc364929a3 100644 --- a/src/Common/OptimizedRegularExpression.cpp +++ b/src/Common/OptimizedRegularExpression.cpp @@ -17,7 +17,7 @@ namespace DB template void OptimizedRegularExpressionImpl::analyze( - const std::string & regexp, + std::string_view regexp, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix) diff --git a/src/Common/OptimizedRegularExpression.h b/src/Common/OptimizedRegularExpression.h index a3d38f27c07..eaa7b06e309 100644 --- a/src/Common/OptimizedRegularExpression.h +++ b/src/Common/OptimizedRegularExpression.h @@ -86,8 +86,6 @@ public: /// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log). const std::unique_ptr & getRE2() const { return re2; } - static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix); - void getAnalyzeResult(std::string & out_required_substring, bool & out_is_trivial, bool & out_required_substring_is_prefix) const { out_required_substring = required_substring; @@ -104,6 +102,8 @@ private: std::optional> case_insensitive_substring_searcher; std::unique_ptr re2; unsigned number_of_subpatterns; + + static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix); }; using OptimizedRegularExpression = OptimizedRegularExpressionImpl; diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 54aaa3116fd..17bda74f8ab 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -63,14 +63,33 @@ inline bool likePatternIsSubstring(std::string_view pattern, String & res) } -/** 'like' - if true, treat pattern as SQL LIKE, otherwise as re2 regexp. - * 'negate' - if true, negate result - * 'case_insensitive' - if true, match case insensitively - * - * NOTE: We want to run regexp search for whole columns by one call (as implemented in function 'position') - * but for that, regexp engine must support \0 bytes and their interpretation as string boundaries. - */ -template +// For more readable instantiations of MatchImpl<> +struct MatchTraits +{ +enum class Syntax +{ + Like, + Re2 +}; + +enum class Case +{ + Sensitive, + Insensitive +}; + +enum class Result +{ + DontNegate, + Negate +}; +}; + +/** + * NOTE: We want to run regexp search for whole columns by one call (as implemented in function 'position') + * but for that, regexp engine must support \0 bytes and their interpretation as string boundaries. + */ +template struct MatchImpl { static constexpr bool use_default_implementation_for_constants = true; @@ -81,6 +100,10 @@ struct MatchImpl using ResultType = UInt8; + static constexpr bool is_like = (syntax_ == MatchTraits::Syntax::Like); + static constexpr bool case_insensitive = (case_ == MatchTraits::Case::Insensitive); + static constexpr bool negate = (result_ == MatchTraits::Result::Negate); + using Searcher = std::conditional_t; @@ -92,16 +115,20 @@ struct MatchImpl const ColumnPtr & start_pos_, PaddedPODArray & res) { + const size_t haystack_size = haystack_offsets.size(); + + if (haystack_size != res.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks and results", name); + if (start_pos_ != nullptr) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Function '{}' doesn't support start_pos argument", name); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); if (haystack_offsets.empty()) return; /// A simple case where the [I]LIKE expression reduces to finding a substring in a string String strstr_pattern; - if (like && impl::likePatternIsSubstring(needle, strstr_pattern)) + if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern)) { const UInt8 * const begin = haystack_data.data(); const UInt8 * const end = haystack_data.data() + haystack_data.size(); @@ -139,7 +166,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -147,28 +174,26 @@ struct MatchImpl regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); - size_t haystack_size = haystack_offsets.size(); - if (required_substring.empty()) { if (!regexp->getRE2()) /// An empty regexp. Always matches. { if (haystack_size) - memset(res.data(), 1, haystack_size * sizeof(res[0])); + memset(res.data(), !negate, haystack_size * sizeof(res[0])); } else { size_t prev_offset = 0; for (size_t i = 0; i < haystack_size; ++i) { - res[i] = negate - ^ regexp->getRE2()->Match( - {reinterpret_cast(&haystack_data[prev_offset]), haystack_offsets[i] - prev_offset - 1}, - 0, - haystack_offsets[i] - prev_offset - 1, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match = regexp->getRE2()->Match( + {reinterpret_cast(&haystack_data[prev_offset]), haystack_offsets[i] - prev_offset - 1}, + 0, + haystack_offsets[i] - prev_offset - 1, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match; prev_offset = haystack_offsets[i]; } @@ -216,14 +241,14 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast(pos) - str_data) : 0; const size_t end_pos = str_size; - res[i] = negate - ^ regexp->getRE2()->Match( - {str_data, str_size}, - start_pos, - end_pos, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match = regexp->getRE2()->Match( + {str_data, str_size}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match; } } else @@ -247,12 +272,17 @@ struct MatchImpl const String & needle, PaddedPODArray & res) { + const size_t haystack_size = haystack.size() / N; + + if (haystack_size != res.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks and results", name); + if (haystack.empty()) return; /// A simple case where the LIKE expression reduces to finding a substring in a string String strstr_pattern; - if (like && impl::likePatternIsSubstring(needle, strstr_pattern)) + if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern)) { const UInt8 * const begin = haystack.data(); const UInt8 * const end = haystack.data() + haystack.size(); @@ -295,7 +325,7 @@ struct MatchImpl } else { - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); String required_substring; bool is_trivial; @@ -303,28 +333,26 @@ struct MatchImpl regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix); - const size_t haystack_size = haystack.size() / N; - if (required_substring.empty()) { if (!regexp->getRE2()) /// An empty regexp. Always matches. { if (haystack_size) - memset(res.data(), 1, haystack_size * sizeof(res[0])); + memset(res.data(), !negate, haystack_size * sizeof(res[0])); } else { size_t offset = 0; for (size_t i = 0; i < haystack_size; ++i) { - res[i] = negate - ^ regexp->getRE2()->Match( - {reinterpret_cast(&haystack[offset]), N}, - 0, - N, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match = regexp->getRE2()->Match( + {reinterpret_cast(&haystack[offset]), N}, + 0, + N, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match; offset += N; } @@ -375,14 +403,14 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (reinterpret_cast(pos) - str_data) : 0; const size_t end_pos = N; - res[i] = negate - ^ regexp->getRE2()->Match( + const bool match = regexp->getRE2()->Match( {str_data, N}, start_pos, end_pos, re2_st::RE2::UNANCHORED, nullptr, 0); + res[i] = negate ^ match; } } else @@ -410,13 +438,11 @@ struct MatchImpl { const size_t haystack_size = haystack_offsets.size(); - if (haystack_size != needle_offset.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Function '{}' unexpectedly received a different number of haystacks and needles", name); + if (haystack_size != needle_offset.size() || haystack_size != res.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks, needles and results", name); if (start_pos_ != nullptr) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Function '{}' doesn't support start_pos argument", name); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); if (haystack_offsets.empty()) return; @@ -440,7 +466,7 @@ struct MatchImpl reinterpret_cast(cur_needle_data), cur_needle_length); - if (like && impl::likePatternIsSubstring(needle, required_substr)) + if (is_like && impl::likePatternIsSubstring(needle, required_substr)) { if (required_substr.size() > cur_haystack_length) res[i] = negate; @@ -448,16 +474,15 @@ struct MatchImpl { Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length); const auto * match = searcher.search(cur_haystack_data, cur_haystack_length); - res[i] = negate - ^ (match != cur_haystack_data + cur_haystack_length); + res[i] = negate ^ (match != cur_haystack_data + cur_haystack_length); } } else { // each row is expected to contain a different like/re2 pattern // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + const int flags = Regexps::buildRe2Flags(); + const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); @@ -465,18 +490,18 @@ struct MatchImpl { if (!regexp.getRE2()) /// An empty regexp. Always matches. { - res[i] = 1; + res[i] = !negate; } else { - res[i] = negate - ^ regexp.getRE2()->Match( - {reinterpret_cast(cur_haystack_data), cur_haystack_length}, - 0, - cur_haystack_length, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match = regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + 0, + cur_haystack_length, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match; } } else @@ -499,14 +524,14 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - res[i] = negate - ^ regexp.getRE2()->Match( - {reinterpret_cast(cur_haystack_data), cur_haystack_length}, - start_pos, - end_pos, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match2 = regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match2; } } } @@ -527,13 +552,11 @@ struct MatchImpl { const size_t haystack_size = haystack.size()/N; - if (haystack_size != needle_offset.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Function '{}' unexpectedly received a different number of haystacks and needles", name); + if (haystack_size != needle_offset.size() || haystack_size != res.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Function '{}' unexpectedly received a different number of haystacks, needles and results", name); if (start_pos_ != nullptr) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Function '{}' doesn't support start_pos argument", name); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function '{}' doesn't support start_pos argument", name); if (haystack.empty()) return; @@ -557,7 +580,7 @@ struct MatchImpl reinterpret_cast(cur_needle_data), cur_needle_length); - if (like && impl::likePatternIsSubstring(needle, required_substr)) + if (is_like && impl::likePatternIsSubstring(needle, required_substr)) { if (required_substr.size() > cur_haystack_length) res[i] = negate; @@ -565,16 +588,15 @@ struct MatchImpl { Searcher searcher(required_substr.data(), required_substr.size(), cur_haystack_length); const auto * match = searcher.search(cur_haystack_data, cur_haystack_length); - res[i] = negate - ^ (match != cur_haystack_data + cur_haystack_length); + res[i] = negate ^ (match != cur_haystack_data + cur_haystack_length); } } else { // each row is expected to contain a different like/re2 pattern // --> bypass the regexp cache, instead construct the pattern on-the-fly - const int flags = Regexps::buildRe2Flags(); - const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); + const int flags = Regexps::buildRe2Flags(); + const auto & regexp = Regexps::Regexp(Regexps::createRegexp(needle, flags)); regexp.getAnalyzeResult(required_substr, is_trivial, required_substring_is_prefix); @@ -582,18 +604,18 @@ struct MatchImpl { if (!regexp.getRE2()) /// An empty regexp. Always matches. { - res[i] = 1; + res[i] = !negate; } else { - res[i] = negate - ^ regexp.getRE2()->Match( - {reinterpret_cast(cur_haystack_data), cur_haystack_length}, - 0, - cur_haystack_length, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match = regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + 0, + cur_haystack_length, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match; } } else @@ -616,14 +638,14 @@ struct MatchImpl const size_t start_pos = (required_substring_is_prefix) ? (match - cur_haystack_data) : 0; const size_t end_pos = cur_haystack_length; - res[i] = negate - ^ regexp.getRE2()->Match( - {reinterpret_cast(cur_haystack_data), cur_haystack_length}, - start_pos, - end_pos, - re2_st::RE2::UNANCHORED, - nullptr, - 0); + const bool match2 = regexp.getRE2()->Match( + {reinterpret_cast(cur_haystack_data), cur_haystack_length}, + start_pos, + end_pos, + re2_st::RE2::UNANCHORED, + nullptr, + 0); + res[i] = negate ^ match2; } } } diff --git a/src/Functions/MultiMatchAnyImpl.h b/src/Functions/MultiMatchAnyImpl.h index 595a3c8de5b..8a65c8cb2b4 100644 --- a/src/Functions/MultiMatchAnyImpl.h +++ b/src/Functions/MultiMatchAnyImpl.h @@ -120,7 +120,7 @@ struct MultiMatchAnyImpl memset(accum.data(), 0, accum.size()); for (size_t j = 0; j < needles.size(); ++j) { - MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), nullptr, accum); + MatchImpl::vectorConstant(haystack_data, haystack_offsets, needles[j].toString(), nullptr, accum); for (size_t i = 0; i < res.size(); ++i) { if constexpr (FindAny) diff --git a/src/Functions/ilike.cpp b/src/Functions/ilike.cpp index 1222cc48d07..b88d01986d5 100644 --- a/src/Functions/ilike.cpp +++ b/src/Functions/ilike.cpp @@ -12,7 +12,7 @@ struct NameILike static constexpr auto name = "ilike"; }; -using ILikeImpl = MatchImpl; +using ILikeImpl = MatchImpl; using FunctionILike = FunctionsStringSearch; } diff --git a/src/Functions/like.h b/src/Functions/like.h index edb738d393b..9e25fc6f4c0 100644 --- a/src/Functions/like.h +++ b/src/Functions/like.h @@ -11,7 +11,7 @@ struct NameLike static constexpr auto name = "like"; }; -using LikeImpl = MatchImpl; +using LikeImpl = MatchImpl; using FunctionLike = FunctionsStringSearch; } diff --git a/src/Functions/likePatternToRegexp.h b/src/Functions/likePatternToRegexp.h index 15e38e61ab4..319a3729e16 100644 --- a/src/Functions/likePatternToRegexp.h +++ b/src/Functions/likePatternToRegexp.h @@ -6,7 +6,7 @@ namespace DB { /// Transforms the [I]LIKE expression into regexp re2. For example, abc%def -> ^abc.*def$ -inline String likePatternToRegexp(const String & pattern) +inline String likePatternToRegexp(std::string_view pattern) { String res; res.reserve(pattern.size() * 2); diff --git a/src/Functions/match.cpp b/src/Functions/match.cpp index 4c329701464..a0789f229fd 100644 --- a/src/Functions/match.cpp +++ b/src/Functions/match.cpp @@ -13,7 +13,7 @@ struct NameMatch static constexpr auto name = "match"; }; -using FunctionMatch = FunctionsStringSearch>; +using FunctionMatch = FunctionsStringSearch>; } diff --git a/src/Functions/notILike.cpp b/src/Functions/notILike.cpp index b5e06ac55f4..5e78db1c518 100644 --- a/src/Functions/notILike.cpp +++ b/src/Functions/notILike.cpp @@ -12,7 +12,7 @@ struct NameNotILike static constexpr auto name = "notILike"; }; -using NotILikeImpl = MatchImpl; +using NotILikeImpl = MatchImpl; using FunctionNotILike = FunctionsStringSearch; } diff --git a/src/Functions/notLike.cpp b/src/Functions/notLike.cpp index 7fa1b6f9122..33a36748bb1 100644 --- a/src/Functions/notLike.cpp +++ b/src/Functions/notLike.cpp @@ -12,7 +12,7 @@ struct NameNotLike static constexpr auto name = "notLike"; }; -using FunctionNotLike = FunctionsStringSearch>; +using FunctionNotLike = FunctionsStringSearch>; }