From a1cab43feb52936e6969fa7d119733efe7808407 Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov <avtokmakov@yandex-team.ru>
Date: Tue, 26 Oct 2021 13:32:07 +0300
Subject: [PATCH] fix five years old bug in StingSearcher

---
 src/Common/StringSearcher.h                   | 133 ++++++------------
 ...163_search_case_insensetive_utf8.reference |  12 ++
 .../01163_search_case_insensetive_utf8.sql    |  12 ++
 3 files changed, 66 insertions(+), 91 deletions(-)
 create mode 100644 tests/queries/0_stateless/01163_search_case_insensetive_utf8.reference
 create mode 100644 tests/queries/0_stateless/01163_search_case_insensetive_utf8.sql
diff --git a/src/Common/StringSearcher.h b/src/Common/StringSearcher.h
index af1d36adf81..f34bc6f7322 100644
--- a/src/Common/StringSearcher.h
+++ b/src/Common/StringSearcher.h
@@ -116,9 +116,9 @@ public:
 
                 /// lower and uppercase variants of the first octet of the first character in `needle`
                 size_t length_l = UTF8::convertCodePointToUTF8(first_l_u32, l_seq, sizeof(l_seq));
-                size_t length_r = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));
+                size_t length_u = UTF8::convertCodePointToUTF8(first_u_u32, u_seq, sizeof(u_seq));
 
-                if (length_l != length_r)
+                if (length_l != length_u)
                     throw Exception{"UTF8 sequences with different lowercase and uppercase lengths are not supported", ErrorCodes::UNSUPPORTED_PARAMETER};
             }
 
@@ -183,6 +183,31 @@ public:
 #endif
     }
 
+    template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
+    ALWAYS_INLINE bool compareTrivial(const CharT * haystack_pos, const CharT * const haystack_end, const uint8_t * needle_pos) const
+    {
+        while (haystack_pos < haystack_end && needle_pos < needle_end)
+        {
+            auto haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
+            auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
+
+            /// Invalid UTF-8, should not compare equals
+            if (!haystack_code_point || !needle_code_point)
+                break;
+
+            /// Not equals case insensitive.
+            if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
+                break;
+
+            /// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
+            const auto len = UTF8::seqLength(*haystack_pos);
+            haystack_pos += len;
+            needle_pos += len;
+        }
+
+        return needle_pos == needle_end;
+    }
+
     template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
     ALWAYS_INLINE bool compare(const CharT * /*haystack*/, const CharT * haystack_end, const CharT * pos) const
     {
@@ -200,34 +225,15 @@ public:
             {
                 if (mask == cachemask)
                 {
-                    pos += cache_valid_len;
-                    auto needle_pos = needle + cache_valid_len;
-
-                    while (needle_pos < needle_end)
-                    {
-                        auto haystack_code_point = UTF8::convertUTF8ToCodePoint(pos, haystack_end - pos);
-                        auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
-
-                        /// Invalid UTF-8, should not compare equals
-                        if (!haystack_code_point || !needle_code_point)
-                            break;
-
-                        /// Not equals case insensitive.
-                        if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
-                            break;
-
-                        /// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
-                        const auto len = UTF8::seqLength(*pos);
-                        pos += len;
-                        needle_pos += len;
-                    }
-
-                    if (needle_pos == needle_end)
+                    if (compareTrivial(pos, haystack_end, needle))
                         return true;
                 }
             }
             else if ((mask & cachemask) == cachemask)
-                return true;
+            {
+                if (compareTrivial(pos, haystack_end, needle))
+                    return true;
+            }
 
             return false;
         }
@@ -238,25 +244,7 @@ public:
             pos += first_needle_symbol_is_ascii;
             auto needle_pos = needle + first_needle_symbol_is_ascii;
 
-            while (needle_pos < needle_end)
-            {
-                auto haystack_code_point = UTF8::convertUTF8ToCodePoint(pos, haystack_end - pos);
-                auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
-
-                /// Invalid UTF-8, should not compare equals
-                if (!haystack_code_point || !needle_code_point)
-                    break;
-
-                /// Not equals case insensitive.
-                if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
-                    break;
-
-                const auto len = UTF8::seqLength(*pos);
-                pos += len;
-                needle_pos += len;
-            }
-
-            if (needle_pos == needle_end)
+            if (compareTrivial(pos, haystack_end, needle_pos))
                 return true;
         }
 
@@ -299,40 +287,21 @@ public:
                     const auto v_against_l_offset = _mm_cmpeq_epi8(v_haystack_offset, cachel);
                     const auto v_against_u_offset = _mm_cmpeq_epi8(v_haystack_offset, cacheu);
                     const auto v_against_l_or_u_offset = _mm_or_si128(v_against_l_offset, v_against_u_offset);
-                    const auto mask_offset = _mm_movemask_epi8(v_against_l_or_u_offset);
+                    const auto mask_offset_both = _mm_movemask_epi8(v_against_l_or_u_offset);
 
                     if (0xffff == cachemask)
                     {
-                        if (mask_offset == cachemask)
+                        if (mask_offset_both == cachemask)
                         {
-                            auto haystack_pos = haystack + cache_valid_len;
-                            auto needle_pos = needle + cache_valid_len;
-
-                            while (haystack_pos < haystack_end && needle_pos < needle_end)
-                            {
-                                auto haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
-                                auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
-
-                                /// Invalid UTF-8, should not compare equals
-                                if (!haystack_code_point || !needle_code_point)
-                                    break;
-
-                                /// Not equals case insensitive.
-                                if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
-                                    break;
-
-                                /// @note assuming sequences for lowercase and uppercase have exact same length (that is not always true)
-                                const auto len = UTF8::seqLength(*haystack_pos);
-                                haystack_pos += len;
-                                needle_pos += len;
-                            }
-
-                            if (needle_pos == needle_end)
+                            if (compareTrivial(haystack, haystack_end, needle))
                                 return haystack;
                         }
                     }
-                    else if ((mask_offset & cachemask) == cachemask)
-                        return haystack;
+                    else if ((mask_offset_both & cachemask) == cachemask)
+                    {
+                        if (compareTrivial(haystack, haystack_end, needle))
+                            return haystack;
+                    }
 
                     /// first octet was ok, but not the first 16, move to start of next sequence and reapply
                     haystack += UTF8::seqLength(*haystack);
@@ -349,25 +318,7 @@ public:
                 auto haystack_pos = haystack + first_needle_symbol_is_ascii;
                 auto needle_pos = needle + first_needle_symbol_is_ascii;
 
-                while (haystack_pos < haystack_end && needle_pos < needle_end)
-                {
-                    auto haystack_code_point = UTF8::convertUTF8ToCodePoint(haystack_pos, haystack_end - haystack_pos);
-                    auto needle_code_point = UTF8::convertUTF8ToCodePoint(needle_pos, needle_end - needle_pos);
-
-                    /// Invalid UTF-8, should not compare equals
-                    if (!haystack_code_point || !needle_code_point)
-                        break;
-
-                    /// Not equals case insensitive.
-                    if (Poco::Unicode::toLower(*haystack_code_point) != Poco::Unicode::toLower(*needle_code_point))
-                        break;
-
-                    const auto len = UTF8::seqLength(*haystack_pos);
-                    haystack_pos += len;
-                    needle_pos += len;
-                }
-
-                if (needle_pos == needle_end)
+                if (compareTrivial(haystack_pos, haystack_end, needle_pos))
                     return haystack;
             }
 
diff --git a/tests/queries/0_stateless/01163_search_case_insensetive_utf8.reference b/tests/queries/0_stateless/01163_search_case_insensetive_utf8.reference
new file mode 100644
index 00000000000..66f4ca4a5a8
--- /dev/null
+++ b/tests/queries/0_stateless/01163_search_case_insensetive_utf8.reference
@@ -0,0 +1,12 @@
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
diff --git a/tests/queries/0_stateless/01163_search_case_insensetive_utf8.sql b/tests/queries/0_stateless/01163_search_case_insensetive_utf8.sql
new file mode 100644
index 00000000000..99bdd38ceae
--- /dev/null
+++ b/tests/queries/0_stateless/01163_search_case_insensetive_utf8.sql
@@ -0,0 +1,12 @@
+SELECT positionCaseInsensitiveUTF8(materialize('сссссс'), 'Ё');
+SELECT countSubstringsCaseInsensitiveUTF8(materialize('сссссс'), 'ё');
+SELECT positionCaseInsensitiveUTF8(materialize('сссссссс'), 'ё');
+SELECT countSubstringsCaseInsensitiveUTF8(materialize('сссссссс'), 'Ё');
+SELECT countSubstringsCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ёёёёёёё');
+SELECT positionCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ёЁёЁёЁё');
+SELECT countSubstringsCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ёЁёЁёЁёЁёЁ');
+SELECT positionCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ЁЁЁЁЁЁЁЁЁЁ');
+SELECT countSubstringsCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ёЁёЁёЁёссс');
+SELECT positionCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ёЁёЁёЁёссс');
+SELECT countSubstringsCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ЁС');
+SELECT positionCaseInsensitiveUTF8(materialize('ссссссссссссссссссс'), 'ёс');