From a85f544205f2e782d4f6c16c0622728475db3571 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Fri, 16 Aug 2024 08:47:28 +0000
Subject: [PATCH 01/18] Update analyzer_tech_debt.txt

---
 tests/analyzer_tech_debt.txt | 1 -
 1 file changed, 1 deletion(-)
diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt
index bd92465e1aa..c8edbdc5932 100644
--- a/tests/analyzer_tech_debt.txt
+++ b/tests/analyzer_tech_debt.txt
@@ -1,4 +1,3 @@
 01624_soft_constraints
-02354_vector_search_queries
 # Check after ConstantNode refactoring
 02944_variant_as_common_type

From 60a6e893a40761eb46655e76cb6a3fe5f177019c Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Fri, 16 Aug 2024 17:56:12 +0800
Subject: [PATCH 02/18] first commit

---
 src/Common/examples/CMakeLists.txt            |   5 +
 src/Common/examples/utf8_upper_lower.cpp      |  27 ++
 src/Functions/LowerUpperImpl.h                |   1 -
 src/Functions/LowerUpperUTF8Impl.h            | 283 +++---------------
 src/Functions/initcapUTF8.cpp                 |   3 +-
 src/Functions/lowerUTF8.cpp                   |  25 +-
 src/Functions/upperUTF8.cpp                   |  24 +-
 .../00170_lower_upper_utf8.reference          |   4 +
 .../0_stateless/00170_lower_upper_utf8.sql    |  11 +
 .../00233_position_function_family.sql        |   3 +
 .../0_stateless/00761_lower_utf8_bug.sql      |   3 +
 .../0_stateless/01278_random_string_utf8.sql  |   3 +
 .../0_stateless/01431_utf8_ubsan.reference    |   4 +-
 .../queries/0_stateless/01431_utf8_ubsan.sql  |   3 +
 .../0_stateless/01590_countSubstrings.sql     |   3 +
 ...71_lower_upper_utf8_row_overlaps.reference |   4 +-
 .../02071_lower_upper_utf8_row_overlaps.sql   |   3 +
 ...new_functions_must_be_documented.reference |   2 -
 .../02514_if_with_lazy_low_cardinality.sql    |   3 +
 .../0_stateless/02807_lower_utf8_msan.sql     |   3 +
 tests/queries/0_stateless/03015_peder1001.sql |   3 +
 21 files changed, 159 insertions(+), 261 deletions(-)
 create mode 100644 src/Common/examples/utf8_upper_lower.cpp

diff --git a/src/Common/examples/CMakeLists.txt b/src/Common/examples/CMakeLists.txt
index 69580d4ad0e..8383e80d09d 100644
--- a/src/Common/examples/CMakeLists.txt
+++ b/src/Common/examples/CMakeLists.txt
@@ -92,3 +92,8 @@ endif()
 
 clickhouse_add_executable (check_pointer_valid check_pointer_valid.cpp)
 target_link_libraries (check_pointer_valid PRIVATE clickhouse_common_io clickhouse_common_config)
+
+if (TARGET ch_contrib::icu)
+    clickhouse_add_executable (utf8_upper_lower utf8_upper_lower.cpp)
+    target_link_libraries (utf8_upper_lower PRIVATE ch_contrib::icu)
+endif ()
diff --git a/src/Common/examples/utf8_upper_lower.cpp b/src/Common/examples/utf8_upper_lower.cpp
new file mode 100644
index 00000000000..826e1763105
--- /dev/null
+++ b/src/Common/examples/utf8_upper_lower.cpp
@@ -0,0 +1,27 @@
+#include <iostream>
+#include <unicode/unistr.h>
+
+std::string utf8_to_lower(const std::string & input)
+{
+    /// Lowercase UTF-8 `input`. fromUTF8() is length-aware, so embedded '\0' bytes survive (c_str() would truncate there).
+    icu::UnicodeString unicode_input = icu::UnicodeString::fromUTF8(input);
+    std::string output;
+    unicode_input.toLower().toUTF8String(output);
+    return output;
+}
+
+std::string utf8_to_upper(const std::string & input)
+{
+    /// Uppercase UTF-8 `input`. fromUTF8() is length-aware, so embedded '\0' bytes survive (c_str() would truncate there).
+    icu::UnicodeString unicode_input = icu::UnicodeString::fromUTF8(input);
+    std::string output;
+    unicode_input.toUpper().toUTF8String(output);
+    return output;
+}
+
+int main()
+{
+    std::string input = "ır"; /// Turkish dotless 'ı' followed by 'r'; the same string is used in the 00170_lower_upper_utf8 test
+    std::cout << "upper:" << utf8_to_upper(input) << std::endl;
+    return 0;
+}
diff --git a/src/Functions/LowerUpperImpl.h b/src/Functions/LowerUpperImpl.h
index d463ef96e16..a52703d10c8 100644
--- a/src/Functions/LowerUpperImpl.h
+++ b/src/Functions/LowerUpperImpl.h
@@ -1,7 +1,6 @@
 #pragma once
 #include <Columns/ColumnString.h>
 
-
 namespace DB
 {
 
diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h
index eedabca5b22..5da085f48e5 100644
--- a/src/Functions/LowerUpperUTF8Impl.h
+++ b/src/Functions/LowerUpperUTF8Impl.h
@@ -1,15 +1,14 @@
 #pragma once
+
+#include "config.h"
+
+#if USE_ICU
+
 #include <Columns/ColumnString.h>
 #include <Functions/LowerUpperImpl.h>
-#include <base/defines.h>
-#include <Poco/UTF8Encoding.h>
+#include <base/find_symbols.h>
+#include <unicode/unistr.h>
 #include <Common/StringUtils.h>
-#include <Common/UTF8Helpers.h>
-
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-
 
 namespace DB
 {
@@ -19,71 +18,7 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
 }
 
-/// xor or do nothing
-template <bool>
-UInt8 xor_or_identity(const UInt8 c, const int mask)
-{
-    return c ^ mask;
-}
-
-template <>
-inline UInt8 xor_or_identity<false>(const UInt8 c, const int)
-{
-    return c;
-}
-
-/// It is caller's responsibility to ensure the presence of a valid cyrillic sequence in array
-template <bool to_lower>
-inline void UTF8CyrillicToCase(const UInt8 *& src, UInt8 *& dst)
-{
-    if (src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
-    {
-        /// ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ
-        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
-        *dst++ = xor_or_identity<to_lower>(*src++, 0x10);
-    }
-    else if (src[0] == 0xD1u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
-    {
-        /// ѐёђѓєѕіїјљњћќѝўџ
-        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
-        *dst++ = xor_or_identity<!to_lower>(*src++, 0x10);
-    }
-    else if (src[0] == 0xD0u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
-    {
-        /// А-П
-        *dst++ = *src++;
-        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
-    }
-    else if (src[0] == 0xD0u && (src[1] >= 0xB0u && src[1] <= 0xBFu))
-    {
-        /// а-п
-        *dst++ = *src++;
-        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
-    }
-    else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu))
-    {
-        /// Р-Я
-        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
-        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
-    }
-    else if (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
-    {
-        /// р-я
-        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
-        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
-    }
-}
-
-
-/** If the string contains UTF-8 encoded text, convert it to the lower (upper) case.
-  * Note: It is assumed that after the character is converted to another case,
-  *  the length of its multibyte sequence in UTF-8 does not change.
-  * Otherwise, the behavior is undefined.
-  */
-template <char not_case_lower_bound,
-    char not_case_upper_bound,
-    int to_case(int),
-    void cyrillic_to_case(const UInt8 *&, UInt8 *&)>
+template <char not_case_lower_bound, char not_case_upper_bound, bool upper>
 struct LowerUpperUTF8Impl
 {
     static void vector(
@@ -103,180 +38,46 @@ struct LowerUpperUTF8Impl
             return;
         }
 
-        res_data.resize_exact(data.size());
-        res_offsets.assign(offsets);
-        array(data.data(), data.data() + data.size(), offsets, res_data.data());
+        res_data.resize(data.size());
+        res_offsets.resize_exact(offsets.size());
+
+        String output;
+        size_t curr_offset = 0;
+        for (size_t i = 0; i < offsets.size(); ++i)
+        {
+            const auto * data_start = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
+            size_t size = offsets[i] - offsets[i - 1];
+
+            icu::UnicodeString input(data_start, static_cast<int32_t>(size), "UTF-8");
+            if constexpr (upper)
+                input.toUpper();
+            else
+                input.toLower();
+
+            output.clear();
+            input.toUTF8String(output);
+
+            /// For valid UTF-8 input strings, ICU sometimes produces output with extra '\0's at the end. Only the data before the first
+            /// '\0' is valid. If the input is not valid UTF-8, then the behavior of lower/upperUTF8 is undefined by definition. In this
+            /// case, the behavior is also reasonable.
+            const char * res_end = find_last_not_symbols_or_null<'\0'>(output.data(), output.data() + output.size());
+            size_t valid_size = res_end ? res_end - output.data() + 1 : 0;
+
+            res_data.resize(curr_offset + valid_size + 1);
+            memcpy(&res_data[curr_offset], output.data(), valid_size);
+            res_data[curr_offset + valid_size] = 0;
+
+            curr_offset += valid_size + 1;
+            res_offsets[i] = curr_offset;
+        }
     }
 
     static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &, size_t)
     {
         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument");
     }
-
-    /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
-     *    `src` and `dst` are incremented by corresponding sequence lengths. */
-    static bool toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool partial)
-    {
-        if (src[0] <= ascii_upper_bound)
-        {
-            if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
-                *dst++ = *src++ ^ flip_case_mask;
-            else
-                *dst++ = *src++;
-        }
-        else if (src + 1 < src_end
-            && ((src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0xBFu)) || (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x9Fu))))
-        {
-            cyrillic_to_case(src, dst);
-        }
-        else if (src + 1 < src_end && src[0] == 0xC2u)
-        {
-            /// Punctuation U+0080 - U+00BF, UTF-8: C2 80 - C2 BF
-            *dst++ = *src++;
-            *dst++ = *src++;
-        }
-        else if (src + 2 < src_end && src[0] == 0xE2u)
-        {
-            /// Characters U+2000 - U+2FFF, UTF-8: E2 80 80 - E2 BF BF
-            *dst++ = *src++;
-            *dst++ = *src++;
-            *dst++ = *src++;
-        }
-        else
-        {
-            size_t src_sequence_length = UTF8::seqLength(*src);
-            /// In case partial buffer was passed (due to SSE optimization)
-            /// we cannot convert it with current src_end, but we may have more
-            /// bytes to convert and eventually got correct symbol.
-            if (partial && src_sequence_length > static_cast<size_t>(src_end - src))
-                return false;
-
-            auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
-            if (src_code_point)
-            {
-                int dst_code_point = to_case(*src_code_point);
-                if (dst_code_point > 0)
-                {
-                    size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
-                    assert(dst_sequence_length <= 4);
-
-                    /// We don't support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8.
-                    /// As an example, this happens for ß and ẞ.
-                    if (dst_sequence_length == src_sequence_length)
-                    {
-                        src += dst_sequence_length;
-                        dst += dst_sequence_length;
-                        return true;
-                    }
-                }
-            }
-
-            *dst = *src;
-            ++dst;
-            ++src;
-        }
-
-        return true;
-    }
-
-private:
-    static constexpr auto ascii_upper_bound = '\x7f';
-    static constexpr auto flip_case_mask = 'A' ^ 'a';
-
-    static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
-    {
-        const auto * offset_it = offsets.begin();
-        const UInt8 * begin = src;
-
-#ifdef __SSE2__
-        static constexpr auto bytes_sse = sizeof(__m128i);
-
-        /// If we are before this position, we can still read at least bytes_sse.
-        const auto * src_end_sse = src_end - bytes_sse + 1;
-
-        /// SSE2 packed comparison operate on signed types, hence compare (c < 0) instead of (c > 0x7f)
-        const auto v_zero = _mm_setzero_si128();
-        const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
-        const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
-        const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
-
-        while (src < src_end_sse)
-        {
-            const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-
-            /// check for ASCII
-            const auto is_not_ascii = _mm_cmplt_epi8(chars, v_zero);
-            const auto mask_is_not_ascii = _mm_movemask_epi8(is_not_ascii);
-
-            /// ASCII
-            if (mask_is_not_ascii == 0)
-            {
-                const auto is_not_case
-                    = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), _mm_cmplt_epi8(chars, v_not_case_upper_bound));
-                const auto mask_is_not_case = _mm_movemask_epi8(is_not_case);
-
-                /// everything in correct case ASCII
-                if (mask_is_not_case == 0)
-                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), chars);
-                else
-                {
-                    /// ASCII in mixed case
-                    /// keep `flip_case_mask` only where necessary, zero out elsewhere
-                    const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case);
-
-                    /// flip case by applying calculated mask
-                    const auto cased_chars = _mm_xor_si128(chars, xor_mask);
-
-                    /// store result back to destination
-                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
-                }
-
-                src += bytes_sse;
-                dst += bytes_sse;
-            }
-            else
-            {
-                /// UTF-8
-
-                /// Find the offset of the next string after src
-                size_t offset_from_begin = src - begin;
-                while (offset_from_begin >= *offset_it)
-                    ++offset_it;
-
-                /// Do not allow one row influence another (since row may have invalid sequence, and break the next)
-                const UInt8 * row_end = begin + *offset_it;
-                chassert(row_end >= src);
-                const UInt8 * expected_end = std::min(src + bytes_sse, row_end);
-
-                while (src < expected_end)
-                {
-                    if (!toCase(src, expected_end, dst, /* partial= */ true))
-                    {
-                        /// Fallback to handling byte by byte.
-                        src_end_sse = src;
-                        break;
-                    }
-                }
-            }
-        }
-
-        /// Find the offset of the next string after src
-        size_t offset_from_begin = src - begin;
-        while (offset_it != offsets.end() && offset_from_begin >= *offset_it)
-            ++offset_it;
-#endif
-
-        /// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another)
-        while (src < src_end)
-        {
-            const UInt8 * row_end = begin + *offset_it;
-            chassert(row_end >= src);
-
-            while (src < row_end)
-                toCase(src, row_end, dst, /* partial= */ false);
-            ++offset_it;
-        }
-    }
 };
 
 }
+
+#endif
diff --git a/src/Functions/initcapUTF8.cpp b/src/Functions/initcapUTF8.cpp
index 282d846094e..004586dce26 100644
--- a/src/Functions/initcapUTF8.cpp
+++ b/src/Functions/initcapUTF8.cpp
@@ -1,9 +1,8 @@
 #include <DataTypes/DataTypeString.h>
 #include <Functions/FunctionStringToString.h>
-#include <Functions/LowerUpperUTF8Impl.h>
 #include <Functions/FunctionFactory.h>
 #include <Poco/Unicode.h>
-
+#include <Common/UTF8Helpers.h>
 
 namespace DB
 {
diff --git a/src/Functions/lowerUTF8.cpp b/src/Functions/lowerUTF8.cpp
index 7adb0069121..e2f7cb84730 100644
--- a/src/Functions/lowerUTF8.cpp
+++ b/src/Functions/lowerUTF8.cpp
@@ -1,9 +1,10 @@
-#include <DataTypes/DataTypeString.h>
+#include "config.h"
+
+#if USE_ICU
+
+#include <Functions/FunctionFactory.h>
 #include <Functions/FunctionStringToString.h>
 #include <Functions/LowerUpperUTF8Impl.h>
-#include <Functions/FunctionFactory.h>
-#include <Poco/Unicode.h>
-
 
 namespace DB
 {
@@ -15,13 +16,25 @@ struct NameLowerUTF8
     static constexpr auto name = "lowerUTF8";
 };
 
-using FunctionLowerUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>, NameLowerUTF8>;
+using FunctionLowerUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'A', 'Z', false>, NameLowerUTF8>;
 
 }
 
 REGISTER_FUNCTION(LowerUTF8)
 {
-    factory.registerFunction<FunctionLowerUTF8>();
+    FunctionDocumentation::Description description
+        = R"(Converts a string to lowercase, assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.)";
+    FunctionDocumentation::Syntax syntax = "lowerUTF8(input)";
+    FunctionDocumentation::Arguments arguments = {{"input", "Input with String type"}};
+    FunctionDocumentation::ReturnedValue returned_value = "A String data type value";
+    FunctionDocumentation::Examples examples = {
+        {"first", "SELECT lowerUTF8('München') as Lowerutf8;", "münchen"},
+    };
+    FunctionDocumentation::Categories categories = {"String"};
+
+    factory.registerFunction<FunctionLowerUTF8>({description, syntax, arguments, returned_value, examples, categories}); /// Register lowerUTF8 together with the documentation assembled above.
 }
 
 }
+
+#endif
diff --git a/src/Functions/upperUTF8.cpp b/src/Functions/upperUTF8.cpp
index 659e67f0ef3..ef26430331f 100644
--- a/src/Functions/upperUTF8.cpp
+++ b/src/Functions/upperUTF8.cpp
@@ -1,8 +1,10 @@
+#include "config.h"
+
+#if USE_ICU
+
+#include <Functions/FunctionFactory.h>
 #include <Functions/FunctionStringToString.h>
 #include <Functions/LowerUpperUTF8Impl.h>
-#include <Functions/FunctionFactory.h>
-#include <Poco/Unicode.h>
-
 
 namespace DB
 {
@@ -14,13 +16,25 @@ struct NameUpperUTF8
     static constexpr auto name = "upperUTF8";
 };
 
-using FunctionUpperUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>, NameUpperUTF8>;
+using FunctionUpperUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'a', 'z', true>, NameUpperUTF8>;
 
 }
 
 REGISTER_FUNCTION(UpperUTF8)
 {
-    factory.registerFunction<FunctionUpperUTF8>();
+    FunctionDocumentation::Description description
+        = R"(Converts a string to uppercase, assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.)";
+    FunctionDocumentation::Syntax syntax = "upperUTF8(input)";
+    FunctionDocumentation::Arguments arguments = {{"input", "Input with String type"}};
+    FunctionDocumentation::ReturnedValue returned_value = "A String data type value";
+    FunctionDocumentation::Examples examples = {
+        {"first", "SELECT upperUTF8('München') as Upperutf8;", "MÜNCHEN"},
+    };
+    FunctionDocumentation::Categories categories = {"String"};
+
+    factory.registerFunction<FunctionUpperUTF8>({description, syntax, arguments, returned_value, examples, categories}); /// Register upperUTF8 together with the documentation assembled above.
 }
 
 }
+
+#endif
diff --git a/tests/queries/0_stateless/00170_lower_upper_utf8.reference b/tests/queries/0_stateless/00170_lower_upper_utf8.reference
index f202cb75513..3c644f22b9b 100644
--- a/tests/queries/0_stateless/00170_lower_upper_utf8.reference
+++ b/tests/queries/0_stateless/00170_lower_upper_utf8.reference
@@ -22,3 +22,7 @@
 1
 1
 1
+1
+1
+1
+1
diff --git a/tests/queries/0_stateless/00170_lower_upper_utf8.sql b/tests/queries/0_stateless/00170_lower_upper_utf8.sql
index 4caba2033ff..85b6c5c6095 100644
--- a/tests/queries/0_stateless/00170_lower_upper_utf8.sql
+++ b/tests/queries/0_stateless/00170_lower_upper_utf8.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 select lower('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
 select lowerUTF8('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
 select lower('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa';
@@ -27,3 +30,11 @@ select sum(lower(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaАБВ
 select sum(upper(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() from system.one array join range(16384) as n;
 select sum(lowerUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaабвгaaaaaaaa')) = count() from system.one array join range(16384) as n;
 select sum(upperUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() from system.one array join range(16384) as n;
+
+-- Turkish language
+select upperUTF8('ır') = 'IR';
+select lowerUTF8('ır') = 'ır';
+
+-- German language
+select upper('öäüß') = 'öäüß';
+select lower('ÖÄÜẞ') = 'ÖÄÜẞ';
diff --git a/tests/queries/0_stateless/00233_position_function_family.sql b/tests/queries/0_stateless/00233_position_function_family.sql
index dd7394bc39a..d6668cb7ba4 100644
--- a/tests/queries/0_stateless/00233_position_function_family.sql
+++ b/tests/queries/0_stateless/00233_position_function_family.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 SET send_logs_level = 'fatal';
 
 select 1 = position('', '');
diff --git a/tests/queries/0_stateless/00761_lower_utf8_bug.sql b/tests/queries/0_stateless/00761_lower_utf8_bug.sql
index de20b894331..a0ab55edc15 100644
--- a/tests/queries/0_stateless/00761_lower_utf8_bug.sql
+++ b/tests/queries/0_stateless/00761_lower_utf8_bug.sql
@@ -1 +1,4 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 SELECT lowerUTF8('\xF0') = lowerUTF8('\xF0');
diff --git a/tests/queries/0_stateless/01278_random_string_utf8.sql b/tests/queries/0_stateless/01278_random_string_utf8.sql
index da2dc48c3e1..290d6a0c759 100644
--- a/tests/queries/0_stateless/01278_random_string_utf8.sql
+++ b/tests/queries/0_stateless/01278_random_string_utf8.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 SELECT randomStringUTF8('string'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
 SELECT lengthUTF8(randomStringUTF8(100));
 SELECT toTypeName(randomStringUTF8(10));
diff --git a/tests/queries/0_stateless/01431_utf8_ubsan.reference b/tests/queries/0_stateless/01431_utf8_ubsan.reference
index c98c950d535..dc785e57851 100644
--- a/tests/queries/0_stateless/01431_utf8_ubsan.reference
+++ b/tests/queries/0_stateless/01431_utf8_ubsan.reference
@@ -1,2 +1,2 @@
-FF
-FF
+EFBFBD
+EFBFBD
diff --git a/tests/queries/0_stateless/01431_utf8_ubsan.sql b/tests/queries/0_stateless/01431_utf8_ubsan.sql
index d6a299225b1..3a28e023805 100644
--- a/tests/queries/0_stateless/01431_utf8_ubsan.sql
+++ b/tests/queries/0_stateless/01431_utf8_ubsan.sql
@@ -1,2 +1,5 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 SELECT hex(lowerUTF8('\xFF'));
 SELECT hex(upperUTF8('\xFF'));
diff --git a/tests/queries/0_stateless/01590_countSubstrings.sql b/tests/queries/0_stateless/01590_countSubstrings.sql
index b38cbb7d188..5ec4f412d7f 100644
--- a/tests/queries/0_stateless/01590_countSubstrings.sql
+++ b/tests/queries/0_stateless/01590_countSubstrings.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 --
 -- countSubstrings
 --
diff --git a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
index a3bac432482..deabef61a88 100644
--- a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
+++ b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
@@ -5,9 +5,9 @@ insert into utf8_overlap values ('\xe2'), ('Foo⚊BarBazBam'), ('\xe2'), ('Foo
 --                                             MONOGRAM FOR YANG
 with lowerUTF8(str) as l_, upperUTF8(str) as u_, '0x' || hex(str) as h_
 select length(str), if(l_ == '\xe2', h_, l_), if(u_ == '\xe2', h_, u_) from utf8_overlap format CSV;
-1,"0xE2","0xE2"
+1,"�","�"
 15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
-1,"0xE2","0xE2"
+1,"�","�"
 15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
 -- NOTE: regression test for introduced bug
 -- https://github.com/ClickHouse/ClickHouse/issues/42756
diff --git a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql
index 8ca0a3f5f75..d175e0659d0 100644
--- a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql
+++ b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 drop table if exists utf8_overlap;
 create table utf8_overlap (str String) engine=Memory();
 
diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
index c39f1fb1ce9..0980e25b70f 100644
--- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
+++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
@@ -416,7 +416,6 @@ logTrace
 lowCardinalityIndices
 lowCardinalityKeys
 lower
-lowerUTF8
 makeDate
 makeDate32
 makeDateTime
@@ -897,7 +896,6 @@ tupleToNameValuePairs
 unbin
 unhex
 upper
-upperUTF8
 uptime
 validateNestedArraySizes
 version
diff --git a/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql b/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql
index 80e3c0a9ece..b169cfd0ab9 100644
--- a/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql
+++ b/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 create table if not exists t (`arr.key` Array(LowCardinality(String)), `arr.value` Array(LowCardinality(String))) engine = Memory;
 insert into t (`arr.key`, `arr.value`) values (['a'], ['b']);
 select if(true, if(lowerUTF8(arr.key) = 'a', 1, 2), 3) as x from t left array join arr;
diff --git a/tests/queries/0_stateless/02807_lower_utf8_msan.sql b/tests/queries/0_stateless/02807_lower_utf8_msan.sql
index e9eb18bf615..95f224577f7 100644
--- a/tests/queries/0_stateless/02807_lower_utf8_msan.sql
+++ b/tests/queries/0_stateless/02807_lower_utf8_msan.sql
@@ -1,2 +1,5 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 SELECT lowerUTF8(arrayJoin(['©--------------------------------------', '©--------------------'])) ORDER BY 1;
 SELECT upperUTF8(materialize('aaaaАБВГaaaaaaaaaaaaАБВГAAAAaaAA')) FROM numbers(2);
diff --git a/tests/queries/0_stateless/03015_peder1001.sql b/tests/queries/0_stateless/03015_peder1001.sql
index 810503207f2..df8e4db1536 100644
--- a/tests/queries/0_stateless/03015_peder1001.sql
+++ b/tests/queries/0_stateless/03015_peder1001.sql
@@ -1,3 +1,6 @@
+-- Tags: no-fasttest
+-- no-fasttest: upper/lowerUTF8 use ICU
+
 DROP TABLE IF EXISTS test_data;
 
 CREATE TABLE test_data

From 4600b270dafec20b276ab83eb557270c24cb4169 Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Fri, 16 Aug 2024 17:58:54 +0800
Subject: [PATCH 03/18] remove icu contrib

---
 .gitmodules | 3 ---
 contrib/icu | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 contrib/icu

diff --git a/.gitmodules b/.gitmodules
index 7fdfb1103c5..164da311930 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -106,9 +106,6 @@
 [submodule "contrib/icudata"]
 	path = contrib/icudata
 	url = https://github.com/ClickHouse/icudata
-[submodule "contrib/icu"]
-	path = contrib/icu
-	url = https://github.com/unicode-org/icu
 [submodule "contrib/flatbuffers"]
 	path = contrib/flatbuffers
 	url = https://github.com/ClickHouse/flatbuffers
diff --git a/contrib/icu b/contrib/icu
deleted file mode 160000
index 7750081bda4..00000000000
--- a/contrib/icu
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 7750081bda4b3bc1768ae03849ec70f67ea10625

From 3ee741bd5e33d16b2f5711a8f2b06fca1a64b7bc Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Fri, 16 Aug 2024 18:04:15 +0800
Subject: [PATCH 04/18] add submodule contrib/icu from clickhouse

---
 .gitmodules | 4 ++++
 contrib/icu | 1 +
 2 files changed, 5 insertions(+)
 create mode 160000 contrib/icu

diff --git a/.gitmodules b/.gitmodules
index 164da311930..a8cc6a07caf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -372,3 +372,7 @@
 [submodule "contrib/numactl"]
 	path = contrib/numactl
 	url = https://github.com/ClickHouse/numactl.git
+[submodule "contrib/icu"]
+	path = contrib/icu
+	url = https://github.com/ClickHouse/icu
+	branch = ClickHouse/release-75-1
diff --git a/contrib/icu b/contrib/icu
new file mode 160000
index 00000000000..4216173eeeb
--- /dev/null
+++ b/contrib/icu
@@ -0,0 +1 @@
+Subproject commit 4216173eeeb39c1d4caaa54a68860e800412d273

From 6bd65dbfa5c5d450e355dc64c110db65d2f56cbb Mon Sep 17 00:00:00 2001
From: Aleksei Filatov <alexfvk@yandex-team.ru>
Date: Fri, 16 Aug 2024 15:07:53 +0000
Subject: [PATCH 05/18] Use HTTP/1.1 for external HTTP authentication

---
 src/Access/HTTPAuthClient.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Access/HTTPAuthClient.h b/src/Access/HTTPAuthClient.h
index a8b56cf05a7..a1b97a729a3 100644
--- a/src/Access/HTTPAuthClient.h
+++ b/src/Access/HTTPAuthClient.h
@@ -82,7 +82,8 @@ public:
 
     Result authenticate(const String & user_name, const String & password) const
     {
-        Poco::Net::HTTPRequest request{Poco::Net::HTTPRequest::HTTP_GET, this->getURI().getPathAndQuery()};
+        Poco::Net::HTTPRequest request{
+            Poco::Net::HTTPRequest::HTTP_GET, this->getURI().getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1};
         Poco::Net::HTTPBasicCredentials basic_credentials{user_name, password};
         basic_credentials.authenticate(request);
 

From 45e06de3267486296cc1452c981a78688a2193ae Mon Sep 17 00:00:00 2001
From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com>
Date: Fri, 16 Aug 2024 18:01:43 +0200
Subject: [PATCH 06/18] Minor update in Dynamic/JSON serializations

---
 src/DataTypes/Serializations/SerializationObject.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp
index 2dd25e540cc..0042aa6d89d 100644
--- a/src/DataTypes/Serializations/SerializationObject.cpp
+++ b/src/DataTypes/Serializations/SerializationObject.cpp
@@ -199,7 +199,7 @@ void SerializationObject::serializeBinaryBulkStatePrefix(
     auto object_state = std::make_shared<SerializeBinaryBulkStateObject>(serialization_version);
     object_state->max_dynamic_paths = column_object.getMaxDynamicPaths();
     /// Write max_dynamic_paths parameter.
-    writeBinaryLittleEndian(object_state->max_dynamic_paths, *stream);
+    writeVarUInt(object_state->max_dynamic_paths, *stream);
     /// Write all dynamic paths in sorted order.
     object_state->sorted_dynamic_paths.reserve(dynamic_paths.size());
     for (const auto & [path, _] : dynamic_paths)
@@ -354,7 +354,7 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationObject::deserializeOb
         readBinaryLittleEndian(serialization_version, *structure_stream);
         auto structure_state = std::make_shared<DeserializeBinaryBulkStateObjectStructure>(serialization_version);
         /// Read max_dynamic_paths parameter.
-        readBinaryLittleEndian(structure_state->max_dynamic_paths, *structure_stream);
+        readVarUInt(structure_state->max_dynamic_paths, *structure_stream);
         /// Read the sorted list of dynamic paths.
         size_t dynamic_paths_size;
         readVarUInt(dynamic_paths_size, *structure_stream);

From c85d5e753899503f93a8f9ca7b67776d386d9130 Mon Sep 17 00:00:00 2001
From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com>
Date: Fri, 16 Aug 2024 18:02:51 +0200
Subject: [PATCH 07/18] Update Dynamic serialization

---
 src/DataTypes/Serializations/SerializationDynamic.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/DataTypes/Serializations/SerializationDynamic.cpp b/src/DataTypes/Serializations/SerializationDynamic.cpp
index 6bba87c40fa..ab24779ced2 100644
--- a/src/DataTypes/Serializations/SerializationDynamic.cpp
+++ b/src/DataTypes/Serializations/SerializationDynamic.cpp
@@ -115,7 +115,7 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix(
     dynamic_state->max_dynamic_types = column_dynamic.getMaxDynamicTypes();
     /// Write max_dynamic_types parameter, because it can differ from the max_dynamic_types
     /// that is specified in the Dynamic type (we could decrease it before merge).
-    writeBinaryLittleEndian(dynamic_state->max_dynamic_types, *stream);
+    writeVarUInt(dynamic_state->max_dynamic_types, *stream);
 
     dynamic_state->variant_type = variant_info.variant_type;
     dynamic_state->variant_names = variant_info.variant_names;
@@ -123,7 +123,7 @@ void SerializationDynamic::serializeBinaryBulkStatePrefix(
 
     /// Write information about variants.
     size_t num_variants = dynamic_state->variant_names.size() - 1; /// Don't write shared variant, Dynamic column should always have it.
-    writeBinaryLittleEndian(num_variants, *stream);
+    writeVarUInt(num_variants, *stream);
     if (settings.data_types_binary_encoding)
     {
         const auto & variants = assert_cast<const DataTypeVariant &>(*dynamic_state->variant_type).getVariants();
@@ -252,11 +252,11 @@ ISerialization::DeserializeBinaryBulkStatePtr SerializationDynamic::deserializeD
         readBinaryLittleEndian(structure_version, *structure_stream);
         auto structure_state = std::make_shared<DeserializeBinaryBulkStateDynamicStructure>(structure_version);
         /// Read max_dynamic_types parameter.
-        readBinaryLittleEndian(structure_state->max_dynamic_types, *structure_stream);
+        readVarUInt(structure_state->max_dynamic_types, *structure_stream);
         /// Read information about variants.
         DataTypes variants;
         size_t num_variants;
-        readBinaryLittleEndian(num_variants, *structure_stream);
+        readVarUInt(num_variants, *structure_stream);
         variants.reserve(num_variants + 1); /// +1 for shared variant.
         if (settings.data_types_binary_encoding)
         {

From cc7d22a7b83440cfbf7d37086ece7fac222f24de Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 16 Aug 2024 23:08:16 +0200
Subject: [PATCH 08/18] Proper parsing of the PostgreSQL-style CAST operator

---
 src/Parsers/ExpressionElementParsers.cpp      | 26 +++++++++++--------
 ..._proper_parsing_of_cast_operator.reference |  4 +++
 .../03227_proper_parsing_of_cast_operator.sql |  6 +++++
 3 files changed, 25 insertions(+), 11 deletions(-)
 create mode 100644 tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.reference
 create mode 100644 tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.sql

diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp
index dd22b80b1cb..ffa1bd93ded 100644
--- a/src/Parsers/ExpressionElementParsers.cpp
+++ b/src/Parsers/ExpressionElementParsers.cpp
@@ -853,9 +853,9 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
 
     /// Parse numbers (including decimals), strings, arrays and tuples of them.
 
+    Pos begin = pos;
     const char * data_begin = pos->begin;
     const char * data_end = pos->end;
-    bool is_string_literal = pos->type == StringLiteral;
 
     if (pos->type == Minus)
     {
@@ -866,7 +866,7 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
         data_end = pos->end;
         ++pos;
     }
-    else if (pos->type == Number || is_string_literal)
+    else if (pos->type == Number || pos->type == StringLiteral)
     {
         ++pos;
     }
@@ -939,18 +939,22 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
     {
         String s;
         size_t data_size = data_end - data_begin;
-        if (is_string_literal)
+        if (begin->type == StringLiteral)
         {
-            ReadBufferFromMemory buf(data_begin, data_size);
-            readQuotedStringWithSQLStyle(s, buf);
-            assert(buf.count() == data_size);
+            ASTPtr literal;
+            if (ParserStringLiteral().parse(begin, literal, expected))
+            {
+                node = createFunctionCast(literal, type_ast);
+                return true;
+            }
+            return false;
         }
         else
-            s = String(data_begin, data_size);
-
-        auto literal = std::make_shared<ASTLiteral>(std::move(s));
-        node = createFunctionCast(literal, type_ast);
-        return true;
+        {
+            auto literal = std::make_shared<ASTLiteral>(String(data_begin, data_size));
+            node = createFunctionCast(literal, type_ast);
+            return true;
+        }
     }
 
     return false;
diff --git a/tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.reference b/tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.reference
new file mode 100644
index 00000000000..2127d396bb3
--- /dev/null
+++ b/tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.reference
@@ -0,0 +1,4 @@
+414243
+ABC
+A
+{"a": \'A\'}
diff --git a/tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.sql b/tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.sql
new file mode 100644
index 00000000000..0c2e7dc582a
--- /dev/null
+++ b/tests/queries/0_stateless/03227_proper_parsing_of_cast_operator.sql
@@ -0,0 +1,6 @@
+SELECT '414243'::String;
+SELECT x'414243'::String;
+SELECT b'01000001'::String;
+SELECT '{"a": \'\x41\'}'::String;
+SELECT '{"a": \'\x4\'}'::String; -- { clientError SYNTAX_ERROR }
+SELECT '{"a": \'a\x4\'}'::String; -- { clientError SYNTAX_ERROR }

From aee031ad4468b870073dc46770d07cea07aa829f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 16 Aug 2024 23:25:49 +0200
Subject: [PATCH 09/18] Slightly better

---
 src/Parsers/ExpressionElementParsers.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp
index ffa1bd93ded..726326bfc85 100644
--- a/src/Parsers/ExpressionElementParsers.cpp
+++ b/src/Parsers/ExpressionElementParsers.cpp
@@ -856,6 +856,7 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
     Pos begin = pos;
     const char * data_begin = pos->begin;
     const char * data_end = pos->end;
+    ASTPtr string_literal;
 
     if (pos->type == Minus)
     {
@@ -866,10 +867,15 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
         data_end = pos->end;
         ++pos;
     }
-    else if (pos->type == Number || pos->type == StringLiteral)
+    else if (pos->type == Number)
     {
         ++pos;
     }
+    else if (pos->type == StringLiteral)
+    {
+        if (!ParserStringLiteral().parse(begin, string_literal, expected))
+            return false;
+    }
     else if (isOneOf<OpeningSquareBracket, OpeningRoundBracket>(pos->type))
     {
         TokenType last_token = OpeningSquareBracket;
@@ -939,15 +945,10 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
     {
         String s;
         size_t data_size = data_end - data_begin;
-        if (begin->type == StringLiteral)
+        if (string_literal)
         {
-            ASTPtr literal;
-            if (ParserStringLiteral().parse(begin, literal, expected))
-            {
-                node = createFunctionCast(literal, type_ast);
-                return true;
-            }
-            return false;
+            node = createFunctionCast(string_literal, type_ast);
+            return true;
         }
         else
         {

From b98249ea7fda526a7a561862fcc4a721e5a4587f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 17 Aug 2024 00:06:47 +0200
Subject: [PATCH 10/18] Use temporary tables for input and output in
 clickhouse-local

---
 programs/local/LocalServer.cpp                                | 2 +-
 tests/queries/0_stateless/01191_rename_dictionary.sql         | 1 +
 .../02141_clickhouse_local_interactive_table.reference        | 4 ++--
 .../0_stateless/02141_clickhouse_local_interactive_table.sh   | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp
index 200beea7b63..a8b774562f9 100644
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@@ -367,7 +367,7 @@ std::string LocalServer::getInitialCreateTableQuery()
     else
         table_structure = "(" + table_structure + ")";
 
-    return fmt::format("CREATE TABLE {} {} ENGINE = File({}, {});",
+    return fmt::format("CREATE TEMPORARY TABLE {} {} ENGINE = File({}, {});",
                        table_name, table_structure, data_format, table_file);
 }
 
diff --git a/tests/queries/0_stateless/01191_rename_dictionary.sql b/tests/queries/0_stateless/01191_rename_dictionary.sql
index c5012dabc81..be95e5a7d4b 100644
--- a/tests/queries/0_stateless/01191_rename_dictionary.sql
+++ b/tests/queries/0_stateless/01191_rename_dictionary.sql
@@ -27,6 +27,7 @@ RENAME DICTIONARY test_01191.t TO test_01191.dict1; -- {serverError INCORRECT_QU
 DROP DICTIONARY test_01191.t; -- {serverError INCORRECT_QUERY}
 DROP TABLE test_01191.t;
 
+DROP DATABASE IF EXISTS dummy_db;
 CREATE DATABASE dummy_db ENGINE=Atomic;
 RENAME DICTIONARY test_01191.dict TO dummy_db.dict1;
 RENAME DICTIONARY dummy_db.dict1 TO test_01191.dict;
diff --git a/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference
index 0bb8966cbe4..0e74c0a083e 100644
--- a/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference
+++ b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.reference
@@ -1,2 +1,2 @@
-CREATE TABLE default.`table`\n(\n    `key` String\n)\nENGINE = File(\'TSVWithNamesAndTypes\', \'/dev/null\')
-CREATE TABLE foo.`table`\n(\n    `key` String\n)\nENGINE = File(\'TSVWithNamesAndTypes\', \'/dev/null\')
+CREATE TEMPORARY TABLE `table`\n(\n    `key` String\n)\nENGINE = File(TSVWithNamesAndTypes, \'/dev/null\')
+CREATE TEMPORARY TABLE `table`\n(\n    `key` String\n)\nENGINE = File(TSVWithNamesAndTypes, \'/dev/null\')
diff --git a/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh
index 934d87616ac..3a95e59416a 100755
--- a/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh
+++ b/tests/queries/0_stateless/02141_clickhouse_local_interactive_table.sh
@@ -4,5 +4,5 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
 . "$CURDIR"/../shell_config.sh
 
-$CLICKHOUSE_LOCAL --file /dev/null --structure "key String" --input-format TSVWithNamesAndTypes --interactive --send_logs_level=trace <<<'show create table table'
-$CLICKHOUSE_LOCAL --database foo --file /dev/null --structure "key String" --input-format TSVWithNamesAndTypes --interactive --send_logs_level=trace <<<'show create table table'
+$CLICKHOUSE_LOCAL --file /dev/null --structure "key String" --input-format TSVWithNamesAndTypes --interactive --send_logs_level=trace <<<'show create temporary table table'
+$CLICKHOUSE_LOCAL --database foo --file /dev/null --structure "key String" --input-format TSVWithNamesAndTypes --interactive --send_logs_level=trace <<<'show create temporary table table'

From da0a8051d8c8e8c2c72145e15cdf5a96e99641d2 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 17 Aug 2024 00:22:57 +0200
Subject: [PATCH 11/18] Miscellaneous changes in database engines

---
 src/Databases/DatabaseLazy.cpp     |  2 +-
 src/Databases/DatabaseLazy.h       |  2 +-
 src/Databases/DatabaseOnDisk.cpp   | 10 +++++-----
 src/Databases/DatabaseOnDisk.h     |  4 ++--
 src/Databases/DatabaseOrdinary.cpp |  4 ++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp
index 3fb6d30fcb8..2ccdd8510a8 100644
--- a/src/Databases/DatabaseLazy.cpp
+++ b/src/Databases/DatabaseLazy.cpp
@@ -52,7 +52,7 @@ DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_,
 
 void DatabaseLazy::loadStoredObjects(ContextMutablePtr local_context, LoadingStrictnessLevel /*mode*/)
 {
-    iterateMetadataFiles(local_context, [this, &local_context](const String & file_name)
+    iterateMetadataFiles([this, &local_context](const String & file_name)
     {
         const std::string table_name = unescapeForFileName(file_name.substr(0, file_name.size() - 4));
 
diff --git a/src/Databases/DatabaseLazy.h b/src/Databases/DatabaseLazy.h
index 41cfb751141..aeac130594f 100644
--- a/src/Databases/DatabaseLazy.h
+++ b/src/Databases/DatabaseLazy.h
@@ -12,7 +12,7 @@ class DatabaseLazyIterator;
 class Context;
 
 /** Lazy engine of databases.
-  * Works like DatabaseOrdinary, but stores in memory only the cache.
+  * Works like DatabaseOrdinary, but stores only recently accessed tables in memory.
   * Can be used only with *Log engines.
   */
 class DatabaseLazy final : public DatabaseOnDisk
diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp
index 734f354d9a5..c80e4def94e 100644
--- a/src/Databases/DatabaseOnDisk.cpp
+++ b/src/Databases/DatabaseOnDisk.cpp
@@ -568,14 +568,14 @@ void DatabaseOnDisk::drop(ContextPtr local_context)
     assert(TSA_SUPPRESS_WARNING_FOR_READ(tables).empty());
     if (local_context->getSettingsRef().force_remove_data_recursively_on_drop)
     {
-        (void)fs::remove_all(local_context->getPath() + getDataPath());
+        (void)fs::remove_all(std::filesystem::path(getContext()->getPath()) / data_path);
         (void)fs::remove_all(getMetadataPath());
     }
     else
     {
         try
         {
-            (void)fs::remove(local_context->getPath() + getDataPath());
+            (void)fs::remove(std::filesystem::path(getContext()->getPath()) / data_path);
             (void)fs::remove(getMetadataPath());
         }
         catch (const fs::filesystem_error & e)
@@ -613,7 +613,7 @@ time_t DatabaseOnDisk::getObjectMetadataModificationTime(const String & object_n
     }
 }
 
-void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const IteratingFunction & process_metadata_file) const
+void DatabaseOnDisk::iterateMetadataFiles(const IteratingFunction & process_metadata_file) const
 {
     auto process_tmp_drop_metadata_file = [&](const String & file_name)
     {
@@ -621,7 +621,7 @@ void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const Iterat
         static const char * tmp_drop_ext = ".sql.tmp_drop";
         const std::string object_name = file_name.substr(0, file_name.size() - strlen(tmp_drop_ext));
 
-        if (fs::exists(local_context->getPath() + getDataPath() + '/' + object_name))
+        if (fs::exists(std::filesystem::path(getContext()->getPath()) / data_path / object_name))
         {
             fs::rename(getMetadataPath() + file_name, getMetadataPath() + object_name + ".sql");
             LOG_WARNING(log, "Object {} was not dropped previously and will be restored", backQuote(object_name));
@@ -638,7 +638,7 @@ void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const Iterat
     std::vector<std::pair<String, bool>> metadata_files;
 
     fs::directory_iterator dir_end;
-    for (fs::directory_iterator dir_it(getMetadataPath()); dir_it != dir_end; ++dir_it)
+    for (fs::directory_iterator dir_it(metadata_path); dir_it != dir_end; ++dir_it)
     {
         String file_name = dir_it->path().filename();
         /// For '.svn', '.gitignore' directory and similar.
diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h
index 12656068643..ffc95a7c128 100644
--- a/src/Databases/DatabaseOnDisk.h
+++ b/src/Databases/DatabaseOnDisk.h
@@ -64,7 +64,7 @@ public:
     time_t getObjectMetadataModificationTime(const String & object_name) const override;
 
     String getDataPath() const override { return data_path; }
-    String getTableDataPath(const String & table_name) const override { return data_path + escapeForFileName(table_name) + "/"; }
+    String getTableDataPath(const String & table_name) const override { return std::filesystem::path(data_path) / escapeForFileName(table_name) / ""; }
     String getTableDataPath(const ASTCreateQuery & query) const override { return getTableDataPath(query.getTable()); }
     String getMetadataPath() const override { return metadata_path; }
 
@@ -83,7 +83,7 @@ protected:
 
     using IteratingFunction = std::function<void(const String &)>;
 
-    void iterateMetadataFiles(ContextPtr context, const IteratingFunction & process_metadata_file) const;
+    void iterateMetadataFiles(const IteratingFunction & process_metadata_file) const;
 
     ASTPtr getCreateTableQueryImpl(
         const String & table_name,
diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp
index 8808261654f..dd8a3f42ea8 100644
--- a/src/Databases/DatabaseOrdinary.cpp
+++ b/src/Databases/DatabaseOrdinary.cpp
@@ -55,7 +55,7 @@ static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;
 static constexpr const char * const CONVERT_TO_REPLICATED_FLAG_NAME = "convert_to_replicated";
 
 DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, ContextPtr context_)
-    : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_)
+    : DatabaseOrdinary(name_, metadata_path_, std::filesystem::path("data") / escapeForFileName(name_) / "", "DatabaseOrdinary (" + name_ + ")", context_)
 {
 }
 
@@ -265,7 +265,7 @@ void DatabaseOrdinary::loadTablesMetadata(ContextPtr local_context, ParsedTables
         }
     };
 
-    iterateMetadataFiles(local_context, process_metadata);
+    iterateMetadataFiles(process_metadata);
 
     size_t objects_in_database = metadata.parsed_tables.size() - prev_tables_count;
     size_t dictionaries_in_database = metadata.total_dictionaries - prev_total_dictionaries;

From 02ec4e2f92ac769f92aebdb714d0d8da1a924984 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 17 Aug 2024 04:00:31 +0200
Subject: [PATCH 12/18] Fix build

---
 src/Parsers/ExpressionElementParsers.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp
index 726326bfc85..61b5723072e 100644
--- a/src/Parsers/ExpressionElementParsers.cpp
+++ b/src/Parsers/ExpressionElementParsers.cpp
@@ -943,7 +943,6 @@ bool ParserCastOperator::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
     if (ParserToken(DoubleColon).ignore(pos, expected)
         && ParserDataType().parse(pos, type_ast, expected))
     {
-        String s;
         size_t data_size = data_end - data_begin;
         if (string_literal)
         {

From a42b12725ab37df74a96e85f7c644c90bd6e30f6 Mon Sep 17 00:00:00 2001
From: Max Kainov <maxkaynov@gmail.com>
Date: Fri, 16 Aug 2024 17:39:09 +0200
Subject: [PATCH 13/18] CI: Native build for package_aarch64

---
 tests/ci/ci_config.py      |  3 ++-
 tests/ci/ci_definitions.py |  1 +
 tests/ci/test_ci_config.py | 14 ++++++++++----
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py
index 7a19eb6f827..173c6c9c931 100644
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@@ -94,7 +94,8 @@ class CI:
                 package_type="deb",
                 static_binary_name="aarch64",
                 additional_pkgs=True,
-            )
+            ),
+            runner_type=Runners.BUILDER_ARM,
         ),
         BuildNames.PACKAGE_ASAN: CommonJobConfigs.BUILD.with_properties(
             build_config=BuildConfig(
diff --git a/tests/ci/ci_definitions.py b/tests/ci/ci_definitions.py
index 48847b0d7a6..1bed9db06f2 100644
--- a/tests/ci/ci_definitions.py
+++ b/tests/ci/ci_definitions.py
@@ -57,6 +57,7 @@ class Runners(metaclass=WithIter):
     """
 
     BUILDER = "builder"
+    BUILDER_ARM = "builder-aarch64"
     STYLE_CHECKER = "style-checker"
     STYLE_CHECKER_ARM = "style-checker-aarch64"
     FUNC_TESTER = "func-tester"
diff --git a/tests/ci/test_ci_config.py b/tests/ci/test_ci_config.py
index 525b3bf367b..c3e55aeac06 100644
--- a/tests/ci/test_ci_config.py
+++ b/tests/ci/test_ci_config.py
@@ -35,10 +35,16 @@ class TestCIConfig(unittest.TestCase):
                     f"Job [{job}] must have style-checker(-aarch64) runner",
                 )
             elif "binary_" in job.lower() or "package_" in job.lower():
-                self.assertTrue(
-                    CI.JOB_CONFIGS[job].runner_type == CI.Runners.BUILDER,
-                    f"Job [{job}] must have [{CI.Runners.BUILDER}] runner",
-                )
+                if job.lower() == CI.BuildNames.PACKAGE_AARCH64:
+                    self.assertTrue(
+                        CI.JOB_CONFIGS[job].runner_type in (CI.Runners.BUILDER_ARM,),
+                        f"Job [{job}] must have [{CI.Runners.BUILDER_ARM}] runner",
+                    )
+                else:
+                    self.assertTrue(
+                        CI.JOB_CONFIGS[job].runner_type in (CI.Runners.BUILDER,),
+                        f"Job [{job}] must have [{CI.Runners.BUILDER}] runner",
+                    )
             elif "aarch64" in job.lower():
                 self.assertTrue(
                     "aarch" in CI.JOB_CONFIGS[job].runner_type,

From 7432400fd0c07b7c967f47b1536706b8f791fcb1 Mon Sep 17 00:00:00 2001
From: Max Kainov <maxkaynov@gmail.com>
Date: Fri, 16 Aug 2024 21:06:58 +0200
Subject: [PATCH 14/18] revert hacks made to prevent OOM in aarch64

---
 cmake/limit_jobs.cmake | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/cmake/limit_jobs.cmake b/cmake/limit_jobs.cmake
index 17d8dd42a2c..8e48fc9b9d8 100644
--- a/cmake/limit_jobs.cmake
+++ b/cmake/limit_jobs.cmake
@@ -42,19 +42,9 @@ endif ()
 # But use 2 parallel jobs, since:
 # - this is what llvm does
 # - and I've verfied that lld-11 does not use all available CPU time (in peak) while linking one binary
-if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND ENABLE_THINLTO)
-    if (ARCH_AARCH64)
-        # aarch64 builds start to often fail with OOMs (reason not yet clear), for now let's limit the concurrency
-        message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 1.")
-        set (PARALLEL_LINK_JOBS 1)
-        if (LINKER_NAME MATCHES "lld")
-            math(EXPR LTO_JOBS ${NUMBER_OF_LOGICAL_CORES}/4)
-            set (CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO "${CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO} -Wl,--thinlto-jobs=${LTO_JOBS}")
-        endif()
-    elseif (PARALLEL_LINK_JOBS GREATER 2)
-        message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.")
-        set (PARALLEL_LINK_JOBS 2)
-    endif ()
+if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND ENABLE_THINLTO AND PARALLEL_LINK_JOBS GREATER 2)
+    message(STATUS "ThinLTO provides its own parallel linking - limiting parallel link jobs to 2.")
+    set (PARALLEL_LINK_JOBS 2)
 endif()
 
 message(STATUS "Building sub-tree with ${PARALLEL_COMPILE_JOBS} compile jobs and ${PARALLEL_LINK_JOBS} linker jobs (system: ${NUMBER_OF_LOGICAL_CORES} cores, ${TOTAL_PHYSICAL_MEMORY} MB RAM, 'OFF' means the native core count).")

From 7071942858e44053b92b8386e68251be7718e3b5 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sun, 18 Aug 2024 09:05:45 +0200
Subject: [PATCH 15/18] Miscellaneous changes from #66999

---
 programs/keeper/Keeper.cpp | 4 +++-
 src/Daemon/BaseDaemon.cpp  | 4 +---
 src/Daemon/BaseDaemon.h    | 1 -
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp
index a447a9e50f6..ced661d9772 100644
--- a/programs/keeper/Keeper.cpp
+++ b/programs/keeper/Keeper.cpp
@@ -66,6 +66,8 @@
 /// A minimal file used when the keeper is run without installation
 INCBIN(keeper_resource_embedded_xml, SOURCE_DIR "/programs/keeper/keeper_embedded.xml");
 
+extern const char * GIT_HASH;
+
 int mainEntryClickHouseKeeper(int argc, char ** argv)
 {
     DB::Keeper app;
@@ -675,7 +677,7 @@ void Keeper::logRevision() const
         "Starting ClickHouse Keeper {} (revision: {}, git hash: {}, build id: {}), PID {}",
         VERSION_STRING,
         ClickHouseRevision::getVersionRevision(),
-        git_hash.empty() ? "<unknown>" : git_hash,
+        GIT_HASH,
         build_id.empty() ? "<unknown>" : build_id,
         getpid());
 }
diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp
index f75699881bd..c42bf7641d2 100644
--- a/src/Daemon/BaseDaemon.cpp
+++ b/src/Daemon/BaseDaemon.cpp
@@ -452,8 +452,6 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
     build_id = SymbolIndex::instance().getBuildIDHex();
 #endif
 
-    git_hash = GIT_HASH;
-
 #if defined(OS_LINUX)
     std::string executable_path = getExecutablePath();
 
@@ -466,7 +464,7 @@ void BaseDaemon::logRevision() const
 {
     logger().information("Starting " + std::string{VERSION_FULL}
         + " (revision: " + std::to_string(ClickHouseRevision::getVersionRevision())
-        + ", git hash: " + (git_hash.empty() ? "<unknown>" : git_hash)
+        + ", git hash: " + std::string(GIT_HASH)
         + ", build id: " + (build_id.empty() ? "<unknown>" : build_id) + ")"
         + ", PID " + std::to_string(getpid()));
 }
diff --git a/src/Daemon/BaseDaemon.h b/src/Daemon/BaseDaemon.h
index b15aa74fcf3..a6efa94a567 100644
--- a/src/Daemon/BaseDaemon.h
+++ b/src/Daemon/BaseDaemon.h
@@ -165,7 +165,6 @@ protected:
     Poco::Util::AbstractConfiguration * last_configuration = nullptr;
 
     String build_id;
-    String git_hash;
     String stored_binary_hash;
 
     bool should_setup_watchdog = false;

From 4f7e3e8374acd98496092e8f8a219af6755a2f70 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sun, 18 Aug 2024 09:38:00 +0200
Subject: [PATCH 16/18] Fix test 01017_uniqCombined_memory_usage

---
 .../0_stateless/01017_uniqCombined_memory_usage.sql        | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql
index c13a0859183..eca370d94af 100644
--- a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql
+++ b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql
@@ -7,7 +7,8 @@
 -- sizeof(HLL) is (2^K * 6 / 8)
 -- hence max_memory_usage for 100 rows = (96<<10)*100 = 9830400
 
-SET use_uncompressed_cache = 0; 
+SET use_uncompressed_cache = 0;
+SET memory_profiler_step = 1;
 
 -- HashTable for UInt32 (used until (1<<13) elements), hence 8192 elements
 SELECT 'UInt32';
@@ -31,14 +32,14 @@ SELECT 'K=16';
 SELECT 'UInt32';
 SET max_memory_usage = 2000000;
 SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(4096 * 100) GROUP BY k); -- { serverError MEMORY_LIMIT_EXCEEDED }
-SET max_memory_usage = 4915200;
+SET max_memory_usage = 5230000;
 SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(4096 * 100) GROUP BY k);
 
 -- HashTable for UInt64 (used until (1<<11) elements), hence 2048 elements
 SELECT 'UInt64';
 SET max_memory_usage = 2000000;
 SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(2048 * 100) GROUP BY k); -- { serverError MEMORY_LIMIT_EXCEEDED }
-SET max_memory_usage = 4915200;
+SET max_memory_usage = 5900000;
 SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(2048 * 100) GROUP BY k);
 
 SELECT 'K=18';

From ec8554c85aeae5dcc8367ce09d093c5526ef1d47 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sun, 18 Aug 2024 09:41:29 +0200
Subject: [PATCH 17/18] Fix build

---
 src/Common/SignalHandlers.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/Common/SignalHandlers.cpp b/src/Common/SignalHandlers.cpp
index c4358da2453..6ac6cbcae29 100644
--- a/src/Common/SignalHandlers.cpp
+++ b/src/Common/SignalHandlers.cpp
@@ -18,13 +18,17 @@
 
 namespace DB
 {
+
 namespace ErrorCodes
 {
 extern const int CANNOT_SET_SIGNAL_HANDLER;
 extern const int CANNOT_SEND_SIGNAL;
 }
+
 }
 
+extern const char * GIT_HASH;
+
 using namespace DB;
 
 
@@ -334,7 +338,7 @@ void SignalListener::onTerminate(std::string_view message, UInt32 thread_num) co
     size_t pos = message.find('\n');
 
     LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) {}",
-              VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "", thread_num, message.substr(0, pos));
+              VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", GIT_HASH, thread_num, message.substr(0, pos));
 
     /// Print trace from std::terminate exception line-by-line to make it easy for grep.
     while (pos != std::string_view::npos)
@@ -368,7 +372,7 @@ try
 
     LOG_FATAL(log, "########## Short fault info ############");
     LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) Received signal {}",
-              VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "",
+              VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", GIT_HASH,
               thread_num, sig);
 
     std::string signal_description = "Unknown signal";
@@ -434,13 +438,13 @@ try
     if (query_id.empty())
     {
         LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) (no query) Received signal {} ({})",
-                  VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "",
+                  VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", GIT_HASH,
                   thread_num, signal_description, sig);
     }
     else
     {
         LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) (query_id: {}) (query: {}) Received signal {} ({})",
-                  VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "",
+                  VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", GIT_HASH,
                   thread_num, query_id, query, signal_description, sig);
     }
 

From f5308635d193859cdb19a71040006278a21bdc51 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Sun, 18 Aug 2024 15:25:07 +0200
Subject: [PATCH 18/18] Revert "Improve compatibility of `upper/lowerUTF8` with
 Spark"

---
 .gitmodules                                   |   7 +-
 contrib/icu                                   |   2 +-
 src/Common/examples/CMakeLists.txt            |   5 -
 src/Common/examples/utf8_upper_lower.cpp      |  27 --
 src/Functions/LowerUpperImpl.h                |   1 +
 src/Functions/LowerUpperUTF8Impl.h            | 283 +++++++++++++++---
 src/Functions/initcapUTF8.cpp                 |   3 +-
 src/Functions/lowerUTF8.cpp                   |  25 +-
 src/Functions/upperUTF8.cpp                   |  24 +-
 .../00170_lower_upper_utf8.reference          |   4 -
 .../0_stateless/00170_lower_upper_utf8.sql    |  11 -
 .../00233_position_function_family.sql        |   3 -
 .../0_stateless/00761_lower_utf8_bug.sql      |   3 -
 .../0_stateless/01278_random_string_utf8.sql  |   3 -
 .../0_stateless/01431_utf8_ubsan.reference    |   4 +-
 .../queries/0_stateless/01431_utf8_ubsan.sql  |   3 -
 .../0_stateless/01590_countSubstrings.sql     |   3 -
 ...71_lower_upper_utf8_row_overlaps.reference |   4 +-
 .../02071_lower_upper_utf8_row_overlaps.sql   |   3 -
 ...new_functions_must_be_documented.reference |   2 +
 .../02514_if_with_lazy_low_cardinality.sql    |   3 -
 .../0_stateless/02807_lower_utf8_msan.sql     |   3 -
 tests/queries/0_stateless/03015_peder1001.sql |   3 -
 23 files changed, 265 insertions(+), 164 deletions(-)
 delete mode 100644 src/Common/examples/utf8_upper_lower.cpp

diff --git a/.gitmodules b/.gitmodules
index f18844e5eb4..cdee6a43ad8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -106,6 +106,9 @@
 [submodule "contrib/icudata"]
 	path = contrib/icudata
 	url = https://github.com/ClickHouse/icudata
+[submodule "contrib/icu"]
+	path = contrib/icu
+	url = https://github.com/unicode-org/icu
 [submodule "contrib/flatbuffers"]
 	path = contrib/flatbuffers
 	url = https://github.com/ClickHouse/flatbuffers
@@ -366,7 +369,3 @@
 [submodule "contrib/numactl"]
 	path = contrib/numactl
 	url = https://github.com/ClickHouse/numactl.git
-[submodule "contrib/icu"]
-	path = contrib/icu
-	url = https://github.com/ClickHouse/icu
-	branch = ClickHouse/release-75-1
diff --git a/contrib/icu b/contrib/icu
index 4216173eeeb..7750081bda4 160000
--- a/contrib/icu
+++ b/contrib/icu
@@ -1 +1 @@
-Subproject commit 4216173eeeb39c1d4caaa54a68860e800412d273
+Subproject commit 7750081bda4b3bc1768ae03849ec70f67ea10625
diff --git a/src/Common/examples/CMakeLists.txt b/src/Common/examples/CMakeLists.txt
index 8383e80d09d..69580d4ad0e 100644
--- a/src/Common/examples/CMakeLists.txt
+++ b/src/Common/examples/CMakeLists.txt
@@ -92,8 +92,3 @@ endif()
 
 clickhouse_add_executable (check_pointer_valid check_pointer_valid.cpp)
 target_link_libraries (check_pointer_valid PRIVATE clickhouse_common_io clickhouse_common_config)
-
-if (TARGET ch_contrib::icu)
-    clickhouse_add_executable (utf8_upper_lower utf8_upper_lower.cpp)
-    target_link_libraries (utf8_upper_lower PRIVATE ch_contrib::icu)
-endif ()
diff --git a/src/Common/examples/utf8_upper_lower.cpp b/src/Common/examples/utf8_upper_lower.cpp
deleted file mode 100644
index 826e1763105..00000000000
--- a/src/Common/examples/utf8_upper_lower.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#include <iostream>
-#include <unicode/unistr.h>
-
-std::string utf8_to_lower(const std::string & input)
-{
-    icu::UnicodeString unicodeInput(input.c_str(), "UTF-8");
-    unicodeInput.toLower();
-    std::string output;
-    unicodeInput.toUTF8String(output);
-    return output;
-}
-
-std::string utf8_to_upper(const std::string & input)
-{
-    icu::UnicodeString unicodeInput(input.c_str(), "UTF-8");
-    unicodeInput.toUpper();
-    std::string output;
-    unicodeInput.toUTF8String(output);
-    return output;
-}
-
-int main()
-{
-    std::string input = "ır";
-    std::cout << "upper:" << utf8_to_upper(input) << std::endl;
-    return 0;
-}
diff --git a/src/Functions/LowerUpperImpl.h b/src/Functions/LowerUpperImpl.h
index a52703d10c8..d463ef96e16 100644
--- a/src/Functions/LowerUpperImpl.h
+++ b/src/Functions/LowerUpperImpl.h
@@ -1,6 +1,7 @@
 #pragma once
 #include <Columns/ColumnString.h>
 
+
 namespace DB
 {
 
diff --git a/src/Functions/LowerUpperUTF8Impl.h b/src/Functions/LowerUpperUTF8Impl.h
index 5da085f48e5..eedabca5b22 100644
--- a/src/Functions/LowerUpperUTF8Impl.h
+++ b/src/Functions/LowerUpperUTF8Impl.h
@@ -1,14 +1,15 @@
 #pragma once
-
-#include "config.h"
-
-#if USE_ICU
-
 #include <Columns/ColumnString.h>
 #include <Functions/LowerUpperImpl.h>
-#include <base/find_symbols.h>
-#include <unicode/unistr.h>
+#include <base/defines.h>
+#include <Poco/UTF8Encoding.h>
 #include <Common/StringUtils.h>
+#include <Common/UTF8Helpers.h>
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
 
 namespace DB
 {
@@ -18,7 +19,71 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
 }
 
-template <char not_case_lower_bound, char not_case_upper_bound, bool upper>
+/// xor or do nothing
+template <bool>
+UInt8 xor_or_identity(const UInt8 c, const int mask)
+{
+    return c ^ mask;
+}
+
+template <>
+inline UInt8 xor_or_identity<false>(const UInt8 c, const int)
+{
+    return c;
+}
+
+/// It is the caller's responsibility to ensure that a valid Cyrillic sequence is present at `src`.
+template <bool to_lower>
+inline void UTF8CyrillicToCase(const UInt8 *& src, UInt8 *& dst)
+{
+    if (src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
+    {
+        /// ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ
+        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
+        *dst++ = xor_or_identity<to_lower>(*src++, 0x10);
+    }
+    else if (src[0] == 0xD1u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
+    {
+        /// ѐёђѓєѕіїјљњћќѝўџ
+        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
+        *dst++ = xor_or_identity<!to_lower>(*src++, 0x10);
+    }
+    else if (src[0] == 0xD0u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
+    {
+        /// А-П
+        *dst++ = *src++;
+        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
+    }
+    else if (src[0] == 0xD0u && (src[1] >= 0xB0u && src[1] <= 0xBFu))
+    {
+        /// а-п
+        *dst++ = *src++;
+        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
+    }
+    else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu))
+    {
+        /// Р-Я
+        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
+        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
+    }
+    else if (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
+    {
+        /// р-я
+        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
+        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
+    }
+}
+
+
+/** If the string contains UTF-8 encoded text, convert it to the lower (upper) case.
+  * Note: It is assumed that after the character is converted to another case,
+  *  the length of its multibyte sequence in UTF-8 does not change.
+  * Otherwise, the behavior is undefined.
+  */
+template <char not_case_lower_bound,
+    char not_case_upper_bound,
+    int to_case(int),
+    void cyrillic_to_case(const UInt8 *&, UInt8 *&)>
 struct LowerUpperUTF8Impl
 {
     static void vector(
@@ -38,46 +103,180 @@ struct LowerUpperUTF8Impl
             return;
         }
 
-        res_data.resize(data.size());
-        res_offsets.resize_exact(offsets.size());
-
-        String output;
-        size_t curr_offset = 0;
-        for (size_t i = 0; i < offsets.size(); ++i)
-        {
-            const auto * data_start = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
-            size_t size = offsets[i] - offsets[i - 1];
-
-            icu::UnicodeString input(data_start, static_cast<int32_t>(size), "UTF-8");
-            if constexpr (upper)
-                input.toUpper();
-            else
-                input.toLower();
-
-            output.clear();
-            input.toUTF8String(output);
-
-            /// For valid UTF-8 input strings, ICU sometimes produces output with extra '\0's at the end. Only the data before the first
-            /// '\0' is valid. It the input is not valid UTF-8, then the behavior of lower/upperUTF8 is undefined by definition. In this
-            /// case, the behavior is also reasonable.
-            const char * res_end = find_last_not_symbols_or_null<'\0'>(output.data(), output.data() + output.size());
-            size_t valid_size = res_end ? res_end - output.data() + 1 : 0;
-
-            res_data.resize(curr_offset + valid_size + 1);
-            memcpy(&res_data[curr_offset], output.data(), valid_size);
-            res_data[curr_offset + valid_size] = 0;
-
-            curr_offset += valid_size + 1;
-            res_offsets[i] = curr_offset;
-        }
+        res_data.resize_exact(data.size());
+        res_offsets.assign(offsets);
+        array(data.data(), data.data() + data.size(), offsets, res_data.data());
     }
 
     static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &, size_t)
     {
         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument");
     }
+
+    /** Converts a single code point starting at `src` to the desired case, storing the result starting at `dst`.
+     *    `src` and `dst` are advanced by the corresponding sequence lengths. Returns false only if `partial` is set and the sequence is cut off by `src_end`. */
+    static bool toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool partial)
+    {
+        if (src[0] <= ascii_upper_bound)
+        {
+            if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
+                *dst++ = *src++ ^ flip_case_mask;
+            else
+                *dst++ = *src++;
+        }
+        else if (src + 1 < src_end
+            && ((src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0xBFu)) || (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x9Fu))))
+        {
+            cyrillic_to_case(src, dst);
+        }
+        else if (src + 1 < src_end && src[0] == 0xC2u)
+        {
+            /// Punctuation U+0080 - U+00BF, UTF-8: C2 80 - C2 BF
+            *dst++ = *src++;
+            *dst++ = *src++;
+        }
+        else if (src + 2 < src_end && src[0] == 0xE2u)
+        {
+            /// Characters U+2000 - U+2FFF, UTF-8: E2 80 80 - E2 BF BF
+            *dst++ = *src++;
+            *dst++ = *src++;
+            *dst++ = *src++;
+        }
+        else
+        {
+            size_t src_sequence_length = UTF8::seqLength(*src);
+            /// If a partial buffer was passed (due to the SSE optimization), the sequence may be
+            /// cut off by the current src_end, so we cannot convert it here; the caller may have
+            /// more bytes available and can eventually decode the correct symbol.
+            if (partial && src_sequence_length > static_cast<size_t>(src_end - src))
+                return false;
+
+            auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
+            if (src_code_point)
+            {
+                int dst_code_point = to_case(*src_code_point);
+                if (dst_code_point > 0)
+                {
+                    size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
+                    assert(dst_sequence_length <= 4);
+
+                    /// We don't support cases where the lowercase and uppercase characters occupy a different number of bytes in UTF-8.
+                    /// As an example, this happens for ß and ẞ.
+                    if (dst_sequence_length == src_sequence_length)
+                    {
+                        src += dst_sequence_length;
+                        dst += dst_sequence_length;
+                        return true;
+                    }
+                }
+            }
+
+            *dst = *src;
+            ++dst;
+            ++src;
+        }
+
+        return true;
+    }
+
+private:
+    static constexpr auto ascii_upper_bound = '\x7f';
+    static constexpr auto flip_case_mask = 'A' ^ 'a';
+
+    static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
+    {
+        const auto * offset_it = offsets.begin();
+        const UInt8 * begin = src;
+
+#ifdef __SSE2__
+        static constexpr auto bytes_sse = sizeof(__m128i);
+
+        /// If we are before this position, we can still read at least bytes_sse.
+        const auto * src_end_sse = src_end - bytes_sse + 1;
+
+        /// SSE2 packed comparisons operate on signed types, hence we compare (c < 0) instead of (c > 0x7f)
+        const auto v_zero = _mm_setzero_si128();
+        const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
+        const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
+        const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
+
+        while (src < src_end_sse)
+        {
+            const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+
+            /// check for ASCII
+            const auto is_not_ascii = _mm_cmplt_epi8(chars, v_zero);
+            const auto mask_is_not_ascii = _mm_movemask_epi8(is_not_ascii);
+
+            /// ASCII
+            if (mask_is_not_ascii == 0)
+            {
+                const auto is_not_case
+                    = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), _mm_cmplt_epi8(chars, v_not_case_upper_bound));
+                const auto mask_is_not_case = _mm_movemask_epi8(is_not_case);
+
+                /// everything in correct case ASCII
+                if (mask_is_not_case == 0)
+                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), chars);
+                else
+                {
+                    /// ASCII in mixed case
+                    /// keep `flip_case_mask` only where necessary, zero out elsewhere
+                    const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case);
+
+                    /// flip case by applying calculated mask
+                    const auto cased_chars = _mm_xor_si128(chars, xor_mask);
+
+                    /// store result back to destination
+                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
+                }
+
+                src += bytes_sse;
+                dst += bytes_sse;
+            }
+            else
+            {
+                /// UTF-8
+
+                /// Find the offset of the next string after src
+                size_t offset_from_begin = src - begin;
+                while (offset_from_begin >= *offset_it)
+                    ++offset_it;
+
+                /// Do not allow one row to influence another (a row may contain an invalid sequence that would otherwise break the next one)
+                const UInt8 * row_end = begin + *offset_it;
+                chassert(row_end >= src);
+                const UInt8 * expected_end = std::min(src + bytes_sse, row_end);
+
+                while (src < expected_end)
+                {
+                    if (!toCase(src, expected_end, dst, /* partial= */ true))
+                    {
+                        /// Fallback to handling byte by byte.
+                        src_end_sse = src;
+                        break;
+                    }
+                }
+            }
+        }
+
+        /// Find the offset of the next string after src
+        size_t offset_from_begin = src - begin;
+        while (offset_it != offsets.end() && offset_from_begin >= *offset_it)
+            ++offset_it;
+#endif
+
+        /// Handle the remaining symbols row by row (so that bad UTF-8 symbols in one row cannot affect another)
+        while (src < src_end)
+        {
+            const UInt8 * row_end = begin + *offset_it;
+            chassert(row_end >= src);
+
+            while (src < row_end)
+                toCase(src, row_end, dst, /* partial= */ false);
+            ++offset_it;
+        }
+    }
 };
 
 }
-
-#endif
diff --git a/src/Functions/initcapUTF8.cpp b/src/Functions/initcapUTF8.cpp
index 004586dce26..282d846094e 100644
--- a/src/Functions/initcapUTF8.cpp
+++ b/src/Functions/initcapUTF8.cpp
@@ -1,8 +1,9 @@
 #include <DataTypes/DataTypeString.h>
 #include <Functions/FunctionStringToString.h>
+#include <Functions/LowerUpperUTF8Impl.h>
 #include <Functions/FunctionFactory.h>
 #include <Poco/Unicode.h>
-#include <Common/UTF8Helpers.h>
+
 
 namespace DB
 {
diff --git a/src/Functions/lowerUTF8.cpp b/src/Functions/lowerUTF8.cpp
index e2f7cb84730..7adb0069121 100644
--- a/src/Functions/lowerUTF8.cpp
+++ b/src/Functions/lowerUTF8.cpp
@@ -1,10 +1,9 @@
-#include "config.h"
-
-#if USE_ICU
-
-#include <Functions/FunctionFactory.h>
+#include <DataTypes/DataTypeString.h>
 #include <Functions/FunctionStringToString.h>
 #include <Functions/LowerUpperUTF8Impl.h>
+#include <Functions/FunctionFactory.h>
+#include <Poco/Unicode.h>
+
 
 namespace DB
 {
@@ -16,25 +15,13 @@ struct NameLowerUTF8
     static constexpr auto name = "lowerUTF8";
 };
 
-using FunctionLowerUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'A', 'Z', false>, NameLowerUTF8>;
+using FunctionLowerUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'A', 'Z', Poco::Unicode::toLower, UTF8CyrillicToCase<true>>, NameLowerUTF8>;
 
 }
 
 REGISTER_FUNCTION(LowerUTF8)
 {
-    FunctionDocumentation::Description description
-        = R"(Converts a string to lowercase, assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.)";
-    FunctionDocumentation::Syntax syntax = "lowerUTF8(input)";
-    FunctionDocumentation::Arguments arguments = {{"input", "Input with String type"}};
-    FunctionDocumentation::ReturnedValue returned_value = "A String data type value";
-    FunctionDocumentation::Examples examples = {
-        {"first", "SELECT lowerUTF8('München') as Lowerutf8;", "münchen"},
-    };
-    FunctionDocumentation::Categories categories = {"String"};
-
-    factory.registerFunction<FunctionLowerUTF8>({description, syntax, arguments, returned_value, examples, categories});
+    factory.registerFunction<FunctionLowerUTF8>();
 }
 
 }
-
-#endif
diff --git a/src/Functions/upperUTF8.cpp b/src/Functions/upperUTF8.cpp
index ef26430331f..659e67f0ef3 100644
--- a/src/Functions/upperUTF8.cpp
+++ b/src/Functions/upperUTF8.cpp
@@ -1,10 +1,8 @@
-#include "config.h"
-
-#if USE_ICU
-
-#include <Functions/FunctionFactory.h>
 #include <Functions/FunctionStringToString.h>
 #include <Functions/LowerUpperUTF8Impl.h>
+#include <Functions/FunctionFactory.h>
+#include <Poco/Unicode.h>
+
 
 namespace DB
 {
@@ -16,25 +14,13 @@ struct NameUpperUTF8
     static constexpr auto name = "upperUTF8";
 };
 
-using FunctionUpperUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'a', 'z', true>, NameUpperUTF8>;
+using FunctionUpperUTF8 = FunctionStringToString<LowerUpperUTF8Impl<'a', 'z', Poco::Unicode::toUpper, UTF8CyrillicToCase<false>>, NameUpperUTF8>;
 
 }
 
 REGISTER_FUNCTION(UpperUTF8)
 {
-    FunctionDocumentation::Description description
-        = R"(Converts a string to lowercase, assuming that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.)";
-    FunctionDocumentation::Syntax syntax = "upperUTF8(input)";
-    FunctionDocumentation::Arguments arguments = {{"input", "Input with String type"}};
-    FunctionDocumentation::ReturnedValue returned_value = "A String data type value";
-    FunctionDocumentation::Examples examples = {
-        {"first", "SELECT upperUTF8('München') as Upperutf8;", "MÜNCHEN"},
-    };
-    FunctionDocumentation::Categories categories = {"String"};
-
-    factory.registerFunction<FunctionUpperUTF8>({description, syntax, arguments, returned_value, examples, categories});
+    factory.registerFunction<FunctionUpperUTF8>();
 }
 
 }
-
-#endif
diff --git a/tests/queries/0_stateless/00170_lower_upper_utf8.reference b/tests/queries/0_stateless/00170_lower_upper_utf8.reference
index 3c644f22b9b..f202cb75513 100644
--- a/tests/queries/0_stateless/00170_lower_upper_utf8.reference
+++ b/tests/queries/0_stateless/00170_lower_upper_utf8.reference
@@ -22,7 +22,3 @@
 1
 1
 1
-1
-1
-1
-1
diff --git a/tests/queries/0_stateless/00170_lower_upper_utf8.sql b/tests/queries/0_stateless/00170_lower_upper_utf8.sql
index 85b6c5c6095..4caba2033ff 100644
--- a/tests/queries/0_stateless/00170_lower_upper_utf8.sql
+++ b/tests/queries/0_stateless/00170_lower_upper_utf8.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 select lower('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
 select lowerUTF8('aaaaaaaaaaaaaaa012345789,.!aaaa' as str) = str;
 select lower('AaAaAaAaAaAaAaA012345789,.!aAaA') = 'aaaaaaaaaaaaaaa012345789,.!aaaa';
@@ -30,11 +27,3 @@ select sum(lower(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaАБВ
 select sum(upper(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() from system.one array join range(16384) as n;
 select sum(lowerUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('aaaaабвгaaaaaaaa')) = count() from system.one array join range(16384) as n;
 select sum(upperUTF8(materialize('aaaaАБВГAAAAaaAA')) = materialize('AAAAАБВГAAAAAAAA')) = count() from system.one array join range(16384) as n;
-
--- Turkish language
-select upperUTF8('ır') = 'IR';
-select lowerUTF8('ır') = 'ır';
-
--- German language
-select upper('öäüß') = 'öäüß';
-select lower('ÖÄÜẞ') = 'ÖÄÜẞ';
diff --git a/tests/queries/0_stateless/00233_position_function_family.sql b/tests/queries/0_stateless/00233_position_function_family.sql
index d6668cb7ba4..dd7394bc39a 100644
--- a/tests/queries/0_stateless/00233_position_function_family.sql
+++ b/tests/queries/0_stateless/00233_position_function_family.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 SET send_logs_level = 'fatal';
 
 select 1 = position('', '');
diff --git a/tests/queries/0_stateless/00761_lower_utf8_bug.sql b/tests/queries/0_stateless/00761_lower_utf8_bug.sql
index a0ab55edc15..de20b894331 100644
--- a/tests/queries/0_stateless/00761_lower_utf8_bug.sql
+++ b/tests/queries/0_stateless/00761_lower_utf8_bug.sql
@@ -1,4 +1 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 SELECT lowerUTF8('\xF0') = lowerUTF8('\xF0');
diff --git a/tests/queries/0_stateless/01278_random_string_utf8.sql b/tests/queries/0_stateless/01278_random_string_utf8.sql
index 290d6a0c759..da2dc48c3e1 100644
--- a/tests/queries/0_stateless/01278_random_string_utf8.sql
+++ b/tests/queries/0_stateless/01278_random_string_utf8.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 SELECT randomStringUTF8('string'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
 SELECT lengthUTF8(randomStringUTF8(100));
 SELECT toTypeName(randomStringUTF8(10));
diff --git a/tests/queries/0_stateless/01431_utf8_ubsan.reference b/tests/queries/0_stateless/01431_utf8_ubsan.reference
index dc785e57851..c98c950d535 100644
--- a/tests/queries/0_stateless/01431_utf8_ubsan.reference
+++ b/tests/queries/0_stateless/01431_utf8_ubsan.reference
@@ -1,2 +1,2 @@
-EFBFBD
-EFBFBD
+FF
+FF
diff --git a/tests/queries/0_stateless/01431_utf8_ubsan.sql b/tests/queries/0_stateless/01431_utf8_ubsan.sql
index 3a28e023805..d6a299225b1 100644
--- a/tests/queries/0_stateless/01431_utf8_ubsan.sql
+++ b/tests/queries/0_stateless/01431_utf8_ubsan.sql
@@ -1,5 +1,2 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 SELECT hex(lowerUTF8('\xFF'));
 SELECT hex(upperUTF8('\xFF'));
diff --git a/tests/queries/0_stateless/01590_countSubstrings.sql b/tests/queries/0_stateless/01590_countSubstrings.sql
index 5ec4f412d7f..b38cbb7d188 100644
--- a/tests/queries/0_stateless/01590_countSubstrings.sql
+++ b/tests/queries/0_stateless/01590_countSubstrings.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 --
 -- countSubstrings
 --
diff --git a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
index deabef61a88..a3bac432482 100644
--- a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
+++ b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.reference
@@ -5,9 +5,9 @@ insert into utf8_overlap values ('\xe2'), ('Foo⚊BarBazBam'), ('\xe2'), ('Foo
 --                                             MONOGRAM FOR YANG
 with lowerUTF8(str) as l_, upperUTF8(str) as u_, '0x' || hex(str) as h_
 select length(str), if(l_ == '\xe2', h_, l_), if(u_ == '\xe2', h_, u_) from utf8_overlap format CSV;
-1,"�","�"
+1,"0xE2","0xE2"
 15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
-1,"�","�"
+1,"0xE2","0xE2"
 15,"foo⚊barbazbam","FOO⚊BARBAZBAM"
 -- NOTE: regression test for introduced bug
 -- https://github.com/ClickHouse/ClickHouse/issues/42756
diff --git a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql
index d175e0659d0..8ca0a3f5f75 100644
--- a/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql
+++ b/tests/queries/0_stateless/02071_lower_upper_utf8_row_overlaps.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 drop table if exists utf8_overlap;
 create table utf8_overlap (str String) engine=Memory();
 
diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
index 0980e25b70f..c39f1fb1ce9 100644
--- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
+++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference
@@ -416,6 +416,7 @@ logTrace
 lowCardinalityIndices
 lowCardinalityKeys
 lower
+lowerUTF8
 makeDate
 makeDate32
 makeDateTime
@@ -896,6 +897,7 @@ tupleToNameValuePairs
 unbin
 unhex
 upper
+upperUTF8
 uptime
 validateNestedArraySizes
 version
diff --git a/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql b/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql
index b169cfd0ab9..80e3c0a9ece 100644
--- a/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql
+++ b/tests/queries/0_stateless/02514_if_with_lazy_low_cardinality.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 create table if not exists t (`arr.key` Array(LowCardinality(String)), `arr.value` Array(LowCardinality(String))) engine = Memory;
 insert into t (`arr.key`, `arr.value`) values (['a'], ['b']);
 select if(true, if(lowerUTF8(arr.key) = 'a', 1, 2), 3) as x from t left array join arr;
diff --git a/tests/queries/0_stateless/02807_lower_utf8_msan.sql b/tests/queries/0_stateless/02807_lower_utf8_msan.sql
index 95f224577f7..e9eb18bf615 100644
--- a/tests/queries/0_stateless/02807_lower_utf8_msan.sql
+++ b/tests/queries/0_stateless/02807_lower_utf8_msan.sql
@@ -1,5 +1,2 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 SELECT lowerUTF8(arrayJoin(['©--------------------------------------', '©--------------------'])) ORDER BY 1;
 SELECT upperUTF8(materialize('aaaaАБВГaaaaaaaaaaaaАБВГAAAAaaAA')) FROM numbers(2);
diff --git a/tests/queries/0_stateless/03015_peder1001.sql b/tests/queries/0_stateless/03015_peder1001.sql
index df8e4db1536..810503207f2 100644
--- a/tests/queries/0_stateless/03015_peder1001.sql
+++ b/tests/queries/0_stateless/03015_peder1001.sql
@@ -1,6 +1,3 @@
--- Tags: no-fasttest
--- no-fasttest: upper/lowerUTF8 use ICU
-
 DROP TABLE IF EXISTS test_data;
 
 CREATE TABLE test_data