From 7232f47c682dc7dc4f30cc3ac978ba5bc8d6c5c1 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 11 May 2022 13:54:26 +0200 Subject: [PATCH] Fix Bug 37114 - ilike on FixedString(N) columns produces wrong results The main fix is in MatchImpl.h where the "case_insensitive" parameter is added to Regexps::get(). Also made "case_insensitive" a non-default template parameter to reduce the risk of future bugs. The remainder of this commit are minor random code improvements. resoves #37114 --- src/Common/ObjectPool.h | 2 +- src/Functions/FunctionsStringArray.h | 4 +-- src/Functions/MatchImpl.h | 29 +++++++++---------- src/Functions/Regexps.h | 15 ++++------ src/Functions/extract.cpp | 2 +- src/Functions/extractAllGroups.h | 2 +- src/Functions/extractGroups.cpp | 2 +- .../02293_ilike_on_fixed_strings.reference | 2 ++ .../02293_ilike_on_fixed_strings.sql | 10 +++++++ 9 files changed, 38 insertions(+), 30 deletions(-) create mode 100644 tests/queries/0_stateless/02293_ilike_on_fixed_strings.reference create mode 100644 tests/queries/0_stateless/02293_ilike_on_fixed_strings.sql diff --git a/src/Common/ObjectPool.h b/src/Common/ObjectPool.h index ef07b8eed1b..801a37d0dfb 100644 --- a/src/Common/ObjectPool.h +++ b/src/Common/ObjectPool.h @@ -94,7 +94,7 @@ public: template Pointer get(const Key & key, Factory && f) { - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); auto it = container.find(key); if (container.end() == it) diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index 6b3adf46ff5..2680816670f 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -477,7 +477,7 @@ public: ErrorCodes::ILLEGAL_COLUMN); if (!col->getValue().empty()) - re = Regexps::get(col->getValue()); + re = Regexps::get(col->getValue()); } @@ -560,7 +560,7 @@ public: + " of first argument of function " + getName() + ". Must be constant string.", ErrorCodes::ILLEGAL_COLUMN); - re = Regexps::get(col->getValue()); + re = Regexps::get(col->getValue()); capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0; matches.resize(capture + 1); diff --git a/src/Functions/MatchImpl.h b/src/Functions/MatchImpl.h index 08ac47e692f..026b38b997b 100644 --- a/src/Functions/MatchImpl.h +++ b/src/Functions/MatchImpl.h @@ -24,12 +24,11 @@ namespace ErrorCodes /// Is the [I]LIKE expression reduced to finding a substring in a string? static inline bool likePatternIsStrstr(const String & pattern, String & res) { - res = ""; - if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%') return false; - res.reserve(pattern.size() * 2); + res = ""; + res.reserve(pattern.size() - 2); const char * pos = pattern.data(); const char * end = pos + pattern.size(); @@ -81,7 +80,7 @@ struct MatchImpl static void vectorConstant( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, - const std::string & pattern, + const String & pattern, const ColumnPtr & start_pos, PaddedPODArray & res) { @@ -92,14 +91,13 @@ struct MatchImpl if (offsets.empty()) return; - String strstr_pattern; - /// A simple case where the [I]LIKE expression reduces to finding a substring in a string + String strstr_pattern; if (like && likePatternIsStrstr(pattern, strstr_pattern)) { - const UInt8 * begin = data.data(); + const UInt8 * const begin = data.data(); + const UInt8 * const end = data.data() + data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// The current index in the array of strings. size_t i = 0; @@ -137,7 +135,7 @@ struct MatchImpl auto regexp = Regexps::get(pattern); - std::string required_substring; + String required_substring; bool is_trivial; bool required_substring_is_prefix; /// for `anchored` execution of the regexp. @@ -172,9 +170,9 @@ struct MatchImpl { /// NOTE This almost matches with the case of LikePatternIsStrstr. - const UInt8 * begin = data.data(); + const UInt8 * const begin = data.data(); + const UInt8 * const end = data.begin() + data.size(); const UInt8 * pos = begin; - const UInt8 * end = pos + data.size(); /// The current index in the array of strings. size_t i = 0; @@ -230,6 +228,7 @@ struct MatchImpl ++i; } + /// Tail, in which there can be no substring. if (i < res.size()) memset(&res[i], revert, (res.size() - i) * sizeof(res[0])); } @@ -238,14 +237,14 @@ struct MatchImpl /// Very carefully crafted copy-paste. static void vectorFixedConstant( - const ColumnString::Chars & data, size_t n, const std::string & pattern, + const ColumnString::Chars & data, size_t n, const String & pattern, PaddedPODArray & res) { if (data.empty()) return; - String strstr_pattern; /// A simple case where the LIKE expression reduces to finding a substring in a string + String strstr_pattern; if (like && likePatternIsStrstr(pattern, strstr_pattern)) { const UInt8 * begin = data.data(); @@ -291,9 +290,9 @@ struct MatchImpl { size_t size = data.size() / n; - auto regexp = Regexps::get(pattern); + auto regexp = Regexps::get(pattern); - std::string required_substring; + String required_substring; bool is_trivial; bool required_substring_is_prefix; /// for `anchored` execution of the regexp. diff --git a/src/Functions/Regexps.h b/src/Functions/Regexps.h index 8e6d30c8e14..9a1938a3f32 100644 --- a/src/Functions/Regexps.h +++ b/src/Functions/Regexps.h @@ -44,23 +44,20 @@ namespace Regexps template inline Regexp createRegexp(const std::string & pattern, int flags) { - return {pattern, flags}; - } - - template <> - inline Regexp createRegexp(const std::string & pattern, int flags) - { - return {likePatternToRegexp(pattern), flags}; + if constexpr (like) + return {likePatternToRegexp(pattern), flags}; + else + return {pattern, flags}; } /** Returns holder of an object from Pool. * You must hold the ownership while using the object. * In destructor, it returns the object back to the Pool for further reuse. */ - template + template inline Pool::Pointer get(const std::string & pattern) { - /// C++11 has thread-safe function-local static on most modern compilers. + /// the Singleton is thread-safe in C++11 static Pool known_regexps; /// Different variables for different pattern parameters. return known_regexps.get(pattern, [&pattern] diff --git a/src/Functions/extract.cpp b/src/Functions/extract.cpp index 0296602d205..5b138d19747 100644 --- a/src/Functions/extract.cpp +++ b/src/Functions/extract.cpp @@ -21,7 +21,7 @@ struct ExtractImpl res_data.reserve(data.size() / 5); res_offsets.resize(offsets.size()); - const auto & regexp = Regexps::get(pattern); + const auto & regexp = Regexps::get(pattern); unsigned capture = regexp->getNumberOfSubpatterns() > 0 ? 1 : 0; OptimizedRegularExpression::MatchVec matches; diff --git a/src/Functions/extractAllGroups.h b/src/Functions/extractAllGroups.h index 057dedab6e4..e6d31e00616 100644 --- a/src/Functions/extractAllGroups.h +++ b/src/Functions/extractAllGroups.h @@ -95,7 +95,7 @@ public: throw Exception("Length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS); using StringPiece = typename Regexps::Regexp::StringPieceType; - auto holder = Regexps::get(needle); + auto holder = Regexps::get(needle); const auto & regexp = holder->getRE2(); if (!regexp) diff --git a/src/Functions/extractGroups.cpp b/src/Functions/extractGroups.cpp index 2286951bb8f..c5b958ec345 100644 --- a/src/Functions/extractGroups.cpp +++ b/src/Functions/extractGroups.cpp @@ -63,7 +63,7 @@ public: if (needle.empty()) throw Exception(getName() + " length of 'needle' argument must be greater than 0.", ErrorCodes::BAD_ARGUMENTS); - auto regexp = Regexps::get(needle); + auto regexp = Regexps::get(needle); const auto & re2 = regexp->getRE2(); if (!re2) diff --git a/tests/queries/0_stateless/02293_ilike_on_fixed_strings.reference b/tests/queries/0_stateless/02293_ilike_on_fixed_strings.reference new file mode 100644 index 00000000000..5489ab3d7ce --- /dev/null +++ b/tests/queries/0_stateless/02293_ilike_on_fixed_strings.reference @@ -0,0 +1,2 @@ +AA 0 1 +Aa 1 1 diff --git a/tests/queries/0_stateless/02293_ilike_on_fixed_strings.sql b/tests/queries/0_stateless/02293_ilike_on_fixed_strings.sql new file mode 100644 index 00000000000..3838e372e24 --- /dev/null +++ b/tests/queries/0_stateless/02293_ilike_on_fixed_strings.sql @@ -0,0 +1,10 @@ +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab (col FixedString(2)) engine = MergeTree() ORDER BY col; + +INSERT INTO tab VALUES ('AA') ('Aa'); + +SELECT col, col LIKE '%a', col ILIKE '%a' FROM tab WHERE col = 'AA'; +SELECT col, col LIKE '%a', col ILIKE '%a' FROM tab WHERE col = 'Aa'; + +DROP TABLE IF EXISTS tab;