Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

2024-11-21 15:12:02 +00:00 · 2018-11-26 01:26:36 +03:00 · 2018-11-26 01:26:36 +03:00 · 4cb7f2896c
commit 4cb7f2896c
parent 68e0a687c8
4 changed files with 31 additions and 15 deletions
--- a/dbms/src/Functions/LowerUpperUTF8Impl.h
+++ b/dbms/src/Functions/LowerUpperUTF8Impl.h
@ -1,5 +1,6 @@
 #include <Columns/ColumnString.h>
 #include <Poco/UTF8Encoding.h>
+#include <Common/UTF8Helpers.h>

 #if __SSE2__
 #include <emmintrin.h>
@ -9,6 +10,11 @@
 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+
 namespace
 {
    /// xor or do nothing
@ -90,10 +96,9 @@ struct LowerUpperUTF8Impl
        array(data.data(), data.data() + data.size(), res_data.data());
    }

-    static void vector_fixed(const ColumnString::Chars & data, size_t /*n*/, ColumnString::Chars & res_data)
+    static void vector_fixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
-        res_data.resize(data.size());
-        array(data.data(), data.data() + data.size(), res_data.data());
+        throw Exception("Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument", ErrorCodes::BAD_ARGUMENTS);
    }

    /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
@ -129,16 +134,28 @@ struct LowerUpperUTF8Impl
        {
            static const Poco::UTF8Encoding utf8;

-            if (const auto chars = utf8.convert(to_case(utf8.convert(src)), dst, src_end - src))
+            int src_sequence_length = UTF8::seqLength(*src);
+
+            int src_code_point = utf8.queryConvert(src, src_end - src);
+            if (src_code_point > 0)
            {
-                src += chars;
-                dst += chars;
-            }
-            else
-            {
-                ++src;
-                ++dst;
+                int dst_code_point = to_case(src_code_point);
+                if (dst_code_point > 0)
+                {
+                    int dst_sequence_length = utf8.convert(dst_code_point, dst, src_end - src);
+
+                    /// We don't support cases where the lowercase and uppercase characters occupy a different number of bytes in UTF-8.
+                    /// As an example, this happens for ß and ẞ.
+                    if (dst_sequence_length == src_sequence_length)
+                    {
+                        src += dst_sequence_length;
+                        dst += dst_sequence_length;
+                        return;
+                    }
+                }
            }
+
+            *dst++ = *src++;
        }
    }

@ -149,7 +166,7 @@ private:
    static void array(const UInt8 * src, const UInt8 * src_end, UInt8 * dst)
    {
 #if __SSE2__
-        const auto bytes_sse = sizeof(__m128i);
+        static constexpr auto bytes_sse = sizeof(__m128i);
        auto src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;

        /// SSE2 packed comparison operate on signed types, hence compare (c < 0) instead of (c > 0x7f)
--- a/dbms/tests/queries/0_stateless/00761_lower_utf8_bug.reference
+++ b/dbms/tests/queries/0_stateless/00761_lower_utf8_bug.reference
@ -0,0 +1 @@
+1
--- a/dbms/tests/queries/0_stateless/00761_lower_utf8_bug.sql
+++ b/dbms/tests/queries/0_stateless/00761_lower_utf8_bug.sql
@ -0,0 +1 @@
+SELECT lowerUTF8('\xF0') = lowerUTF8('\xF0');
--- a/dbms/tests/queries/bugs/fuzzy.sql
+++ b/dbms/tests/queries/bugs/fuzzy.sql
@ -6,9 +6,6 @@ SELECT globalNotIn(['"wh'], [NULL]);
 SELECT globalIn([''], [NULL])
 SELECT ( SELECT toDecimal128([], rowNumberInBlock()) ) , lcm('', [[(CAST(('>A') AS String))]]);
 SELECT truncate(895, -16);
-SELECT (CAST((lowerUTF8('a7\xwK>-')) AS String)), [6935];
-SELECT upperUTF8(sipHash128('\0')), [], ['xD2jG'];
-SELECT upperUTF8(SHA256(''));
 SELECT arrayEnumerateUniq(anyHeavy([]), []);
 SELECT notIn([['']], [[NULL]]);
 SELECT subtractDays((CAST((-5263074.47) AS DateTime)), -737895);
				`@ -0,0 +1 @@`
				`SELECT lowerUTF8('\xF0') = lowerUTF8('\xF0');`