Merge pull request #68822 from bigo-sg/improve_upper_lower

Speed up functions `lowerUTF8`/`upperUTF8`
2024-11-23 08:02:02 +00:00 · 2024-08-28 08:53:30 +00:00 · 2024-08-28 08:53:30 +00:00 · cbf60f83b2
commit cbf60f83b2
parent 9fb4c23c06 53ace69ede
3 changed files with 53 additions and 32 deletions
--- a/src/Functions/LowerUpperUTF8Impl.h
+++ b/src/Functions/LowerUpperUTF8Impl.h
@ -4,17 +4,21 @@

 #if USE_ICU

-#include <Columns/ColumnString.h>
-#include <Functions/LowerUpperImpl.h>
-#include <unicode/unistr.h>
-#include <Common/StringUtils.h>
+#    include <Columns/ColumnString.h>
+#    include <Functions/LowerUpperImpl.h>
+#    include <unicode/ucasemap.h>
+#    include <unicode/unistr.h>
+#    include <unicode/urename.h>
+#    include <unicode/utypes.h>
+#    include <Common/StringUtils.h>

 namespace DB
 {

 namespace ErrorCodes
 {
-    extern const int BAD_ARGUMENTS;
+extern const int BAD_ARGUMENTS;
+extern const int LOGICAL_ERROR;
 }

 template <char not_case_lower_bound, char not_case_upper_bound, bool upper>
@ -27,7 +31,7 @@ struct LowerUpperUTF8Impl
        ColumnString::Offsets & res_offsets,
        size_t input_rows_count)
    {
-        if (data.empty())
+        if (input_rows_count == 0)
            return;

        bool all_ascii = isAllASCII(data.data(), data.size());
@ -38,39 +42,56 @@ struct LowerUpperUTF8Impl
        }

        res_data.resize(data.size());
-        res_offsets.resize_exact(offsets.size());
+        res_offsets.resize_exact(input_rows_count);
+
+        UErrorCode error_code = U_ZERO_ERROR;
+        UCaseMap * case_map = ucasemap_open("", U_FOLD_CASE_DEFAULT, &error_code);
+        if (U_FAILURE(error_code))
+            throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Error calling ucasemap_open: {}", u_errorName(error_code));

-        String output;
        size_t curr_offset = 0;
-        for (size_t i = 0; i < input_rows_count; ++i)
+        for (size_t row_i = 0; row_i < input_rows_count; ++row_i)
        {
-            const auto * data_start = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
-            size_t size = offsets[i] - offsets[i - 1];
+            const auto * src = reinterpret_cast<const char *>(&data[offsets[row_i - 1]]);
+            size_t src_size = offsets[row_i] - offsets[row_i - 1] - 1;

-            icu::UnicodeString input(data_start, static_cast<int32_t>(size), "UTF-8");
+            int32_t dst_size;
            if constexpr (upper)
-                input.toUpper();
+                dst_size = ucasemap_utf8ToUpper(
+                    case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
            else
-                input.toLower();
+                dst_size = ucasemap_utf8ToLower(
+                    case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);

-            output.clear();
-            input.toUTF8String(output);
+            if (error_code == U_BUFFER_OVERFLOW_ERROR || error_code == U_STRING_NOT_TERMINATED_WARNING)
+            {
+                size_t new_size = curr_offset + dst_size + 1;
+                res_data.resize(new_size);

-            /// For valid UTF-8 input strings, ICU sometimes produces output with an extra '\0 at the end. Only the data before that
-            /// '\0' is valid. If the input is not valid UTF-8, then the behavior of lower/upperUTF8 is undefined by definition. In this
-            /// case, the behavior is also reasonable.
-            size_t valid_size = output.size();
-            if (!output.empty() && output.back() == '\0')
-                --valid_size;
+                error_code = U_ZERO_ERROR;
+                if constexpr (upper)
+                    dst_size = ucasemap_utf8ToUpper(
+                        case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
+                else
+                    dst_size = ucasemap_utf8ToLower(
+                        case_map, reinterpret_cast<char *>(&res_data[curr_offset]), res_data.size() - curr_offset, src, src_size, &error_code);
+            }

-            res_data.resize(curr_offset + valid_size + 1);
+            if (error_code != U_ZERO_ERROR)
+                throw DB::Exception(
+                    ErrorCodes::LOGICAL_ERROR,
+                    "Error calling {}: {} input: {} input_size: {}",
+                    upper ? "ucasemap_utf8ToUpper" : "ucasemap_utf8ToLower",
+                    u_errorName(error_code),
+                    std::string_view(src, src_size),
+                    src_size);

-            memcpy(&res_data[curr_offset], output.data(), valid_size);
-            res_data[curr_offset + valid_size] = 0;
-
-            curr_offset += valid_size + 1;
-            res_offsets[i] = curr_offset;
+            res_data[curr_offset + dst_size] = 0;
+            curr_offset += dst_size + 1;
+            res_offsets[row_i] = curr_offset;
        }
+
+        res_data.resize(curr_offset);
    }

    static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &, size_t)
--- a/tests/queries/0_stateless/01431_utf8_ubsan.reference
+++ b/tests/queries/0_stateless/01431_utf8_ubsan.reference
@ -1,2 +1,2 @@
-EFBFBD
-EFBFBD
+FF
+FF
--- a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
+++ b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
@ -5,9 +5,9 @@ insert into utf8_overlap values ('\xe2'), ('Foo⚊BarBazBam'), ('\xe2'), ('Foo
 --                                             MONOGRAM FOR YANG
 with lowerUTF8(str) as l_, upperUTF8(str) as u_, '0x' || hex(str) as h_
 select length(str), if(l_ == '\xe2', h_, l_), if(u_ == '\xe2', h_, u_) from utf8_overlap format CSV;
-1,"<EFBFBD>","<22>"
+1,"0xE2","0xE2"
 15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
-1,"<EFBFBD>","<22>"
+1,"0xE2","0xE2"
 15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
 -- NOTE: regression test for introduced bug
 -- https://github.com/ClickHouse/ClickHouse/issues/42756