ClickHouse/src/Functions/LowerUpperUTF8Impl.h

#pragma once
#include <Columns/ColumnString.h>
#include <Poco/UTF8Encoding.h>
#include <Common/UTF8Helpers.h>
#include <base/defines.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif


namespace DB
{

namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
}

/// xor or do nothing
template <bool>
UInt8 xor_or_identity(const UInt8 c, const int mask)
{
    return c ^ mask;
}

template <>
inline UInt8 xor_or_identity<false>(const UInt8 c, const int)
{
    return c;
}

/// It is caller's responsibility to ensure the presence of a valid cyrillic sequence in array
template <bool to_lower>
inline void UTF8CyrillicToCase(const UInt8 *& src, UInt8 *& dst)
{
    if (src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
    {
        /// ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ
        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
        *dst++ = xor_or_identity<to_lower>(*src++, 0x10);
    }
    else if (src[0] == 0xD1u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
    {
        /// ѐёђѓєѕіїјљњћќѝўџ
        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
        *dst++ = xor_or_identity<!to_lower>(*src++, 0x10);
    }
    else if (src[0] == 0xD0u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
    {
        /// А-П
        *dst++ = *src++;
        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
    }
    else if (src[0] == 0xD0u && (src[1] >= 0xB0u && src[1] <= 0xBFu))
    {
        /// а-п
        *dst++ = *src++;
        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
    }
    else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu))
    {
        /// Р-Я
        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
    }
    else if (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
    {
        /// р-я
        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
    }
}


/** If the string contains UTF-8 encoded text, convert it to the lower (upper) case.
  * Note: It is assumed that after the character is converted to another case,
  *  the length of its multibyte sequence in UTF-8 does not change.
  * Otherwise, the behavior is undefined.
  */
template <char not_case_lower_bound,
    char not_case_upper_bound,
    int to_case(int),
    void cyrillic_to_case(const UInt8 *&, UInt8 *&)>
struct LowerUpperUTF8Impl
{
    static void vector(
        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        ColumnString::Chars & res_data,
        ColumnString::Offsets & res_offsets)
    {
        if (data.empty())
            return;
        res_data.resize(data.size());
        res_offsets.assign(offsets);
        array(data.data(), data.data() + data.size(), offsets, res_data.data());
    }

    static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
    {
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument");
    }

    /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
     *    `src` and `dst` are incremented by corresponding sequence lengths. */
    static bool toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool partial)
    {
        if (src[0] <= ascii_upper_bound)
        {
            if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
                *dst++ = *src++ ^ flip_case_mask;
            else
                *dst++ = *src++;
        }
        else if (src + 1 < src_end
            && ((src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0xBFu)) || (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x9Fu))))
        {
            cyrillic_to_case(src, dst);
        }
        else if (src + 1 < src_end && src[0] == 0xC2u)
        {
            /// Punctuation U+0080 - U+00BF, UTF-8: C2 80 - C2 BF
            *dst++ = *src++;
            *dst++ = *src++;
        }
        else if (src + 2 < src_end && src[0] == 0xE2u)
        {
            /// Characters U+2000 - U+2FFF, UTF-8: E2 80 80 - E2 BF BF
            *dst++ = *src++;
            *dst++ = *src++;
            *dst++ = *src++;
        }
        else
        {
            static const Poco::UTF8Encoding utf8;

            size_t src_sequence_length = UTF8::seqLength(*src);
            /// In case partial buffer was passed (due to SSE optimization)
            /// we cannot convert it with current src_end, but we may have more
            /// bytes to convert and eventually got correct symbol.
            if (partial && src_sequence_length > static_cast<size_t>(src_end-src))
                return false;

            auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
            if (src_code_point)
            {
                int dst_code_point = to_case(*src_code_point);
                if (dst_code_point > 0)
                {
                    size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
                    assert(dst_sequence_length <= 4);

                    /// We don't support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8.
                    /// As an example, this happens for ß and ẞ.
                    if (dst_sequence_length == src_sequence_length)
                    {
                        src += dst_sequence_length;
                        dst += dst_sequence_length;
                        return true;
                    }
                }
            }

            *dst = *src;
            ++dst;
            ++src;
        }

        return true;
    }

private:
    static constexpr auto ascii_upper_bound = '\x7f';
    static constexpr auto flip_case_mask = 'A' ^ 'a';

    static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
    {
        auto offset_it = offsets.begin();
        const UInt8 * begin = src;

#ifdef __SSE2__
        static constexpr auto bytes_sse = sizeof(__m128i);
        const auto * src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;

        /// SSE2 packed comparison operate on signed types, hence compare (c < 0) instead of (c > 0x7f)
        const auto v_zero = _mm_setzero_si128();
        const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
        const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
        const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);

        while (src < src_end_sse)
        {
            const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));

            /// check for ASCII
            const auto is_not_ascii = _mm_cmplt_epi8(chars, v_zero);
            const auto mask_is_not_ascii = _mm_movemask_epi8(is_not_ascii);

            /// ASCII
            if (mask_is_not_ascii == 0)
            {
                const auto is_not_case
                    = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), _mm_cmplt_epi8(chars, v_not_case_upper_bound));
                const auto mask_is_not_case = _mm_movemask_epi8(is_not_case);

                /// everything in correct case ASCII
                if (mask_is_not_case == 0)
                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), chars);
                else
                {
                    /// ASCII in mixed case
                    /// keep `flip_case_mask` only where necessary, zero out elsewhere
                    const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case);

                    /// flip case by applying calculated mask
                    const auto cased_chars = _mm_xor_si128(chars, xor_mask);

                    /// store result back to destination
                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
                }

                src += bytes_sse;
                dst += bytes_sse;
            }
            else
            {
                /// UTF-8

                size_t offset_from_begin = src - begin;
                while (offset_from_begin >= *offset_it)
                    ++offset_it;
                /// Do not allow one row influence another (since row may have invalid sequence, and break the next)
                const UInt8 * row_end = begin + *offset_it;
                chassert(row_end >= src);
                const UInt8 * expected_end = std::min(src + bytes_sse, row_end);

                while (src < expected_end)
                {
                    if (!toCase(src, expected_end, dst, /* partial= */ true))
                    {
                        /// Fallback to handling byte by byte.
                        src_end_sse = src;
                        break;
                    }
                }
            }
        }

        /// Find which offset src has now
        while (offset_it != offsets.end() && static_cast<size_t>(src - begin) >= *offset_it)
            ++offset_it;
#endif

        /// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another)
        while (src < src_end)
        {
            const UInt8 * row_end = begin + *offset_it;
            chassert(row_end >= src);

            while (src < row_end)
                toCase(src, row_end, dst, /* partial= */ false);
            ++offset_it;
        }
    }
};

}
-												Check for #pragma once in headers

											
										
										
											2020-10-10 18:37:02 +00:00
+								#pragma once
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								#include <Columns/ColumnString.h>
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:47:56 +00:00
+								#include <Poco/UTF8Encoding.h>
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
+								#include <Common/UTF8Helpers.h>
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								#include <base/defines.h>
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
-												Added some warnings from clang's -Weverything

											
										
										
											2019-01-04 12:10:00 +00:00
+								#ifdef __SSE2__
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								#include <emmintrin.h>
 								#endif
 								namespace DB
 								{
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
+								namespace ErrorCodes
 								{
 								    extern const int BAD_ARGUMENTS;
 								}
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								/// xor or do nothing
 								template <bool>
 								UInt8 xor_or_identity(const UInt8 c, const int mask)
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								{
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								    return c ^ mask;
 								}
 								template <>
 								inline UInt8 xor_or_identity<false>(const UInt8 c, const int)
 								{
 								    return c;
 								}
 								/// It is caller's responsibility to ensure the presence of a valid cyrillic sequence in array
 								template <bool to_lower>
 								inline void UTF8CyrillicToCase(const UInt8 *& src, UInt8 *& dst)
 								{
 								    if (src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    {
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								        /// ЀЁЂЃЄЅІЇЈЉЊЋЌЍЎЏ
 								        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
 								        *dst++ = xor_or_identity<to_lower>(*src++, 0x10);
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								    else if (src[0] == 0xD1u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    {
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								        /// ѐёђѓєѕіїјљњћќѝўџ
 								        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
 								        *dst++ = xor_or_identity<!to_lower>(*src++, 0x10);
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								    else if (src[0] == 0xD0u && (src[1] >= 0x90u && src[1] <= 0x9Fu))
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    {
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								        /// А-П
 								        *dst++ = *src++;
 								        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
 								    }
 								    else if (src[0] == 0xD0u && (src[1] >= 0xB0u && src[1] <= 0xBFu))
 								    {
 								        /// а-п
 								        *dst++ = *src++;
 								        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
 								    }
 								    else if (src[0] == 0xD0u && (src[1] >= 0xA0u && src[1] <= 0xAFu))
 								    {
 								        /// Р-Я
 								        *dst++ = xor_or_identity<to_lower>(*src++, 0x1);
 								        *dst++ = xor_or_identity<to_lower>(*src++, 0x20);
 								    }
 								    else if (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x8Fu))
 								    {
 								        /// р-я
 								        *dst++ = xor_or_identity<!to_lower>(*src++, 0x1);
 								        *dst++ = xor_or_identity<!to_lower>(*src++, 0x20);
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
 								}
 								/** If the string contains UTF-8 encoded text, convert it to the lower (upper) case.
 								  * Note: It is assumed that after the character is converted to another case,
 								  *  the length of its multibyte sequence in UTF-8 does not change.
 								  * Otherwise, the behavior is undefined.
 								  */
 								template <char not_case_lower_bound,
 								    char not_case_upper_bound,
 								    int to_case(int),
 								    void cyrillic_to_case(const UInt8 *&, UInt8 *&)>
 								struct LowerUpperUTF8Impl
 								{
 								    static void vector(
-												Fixed error with "extractURLParameter" function (read after buffer); improved performance; added support for zero bytes in URLs; renamed Chars_t type [#CLICKHOUSE-2]

											
										
										
											2018-11-25 00:08:50 +00:00
+								        const ColumnString::Chars & data,
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								        const ColumnString::Offsets & offsets,
-												Fixed error with "extractURLParameter" function (read after buffer); improved performance; added support for zero bytes in URLs; renamed Chars_t type [#CLICKHOUSE-2]

											
										
										
											2018-11-25 00:08:50 +00:00
+								        ColumnString::Chars & res_data,
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								        ColumnString::Offsets & res_offsets)
 								    {
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								        if (data.empty())
 								            return;
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								        res_data.resize(data.size());
 								        res_offsets.assign(offsets);
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								        array(data.data(), data.data() + data.size(), offsets, res_data.data());
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
-												Clang Tidy, part 7 (#9799)

* Attempt to enable identifier naming check

* Attempt to enable identifier naming check, continue

* Progress on identifier names

* Adopt identifier names check

* Fixed error

* Merge with master

* Fixed build

* Fixed build

* Fixed build

* Fixed build

* Fixed build

* Fixed error

* Fixed error

* Fixed error
											
										
										
											2020-03-23 02:12:31 +00:00
+								    static void vectorFixed(const ColumnString::Chars &, size_t, ColumnString::Chars &)
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    {
-												Better formatting for exception messages (#45449)

* save format string for NetException

* format exceptions

* format exceptions 2

* format exceptions 3

* format exceptions 4

* format exceptions 5

* format exceptions 6

* fix

* format exceptions 7

* format exceptions 8

* Update MergeTreeIndexGin.cpp

* Update AggregateFunctionMap.cpp

* Update AggregateFunctionMap.cpp

* fix
											
										
										
											2023-01-23 21:13:58 +00:00
+								        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Functions lowerUTF8 and upperUTF8 cannot work with FixedString argument");
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
 								    /** Converts a single code point starting at `src` to desired case, storing result starting at `dst`.
 								     *    `src` and `dst` are incremented by corresponding sequence lengths. */
-												Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary

In lowerUTF8()/upperUTF8() there is an SSE optimization that handles
16 byte at a time, but only for ASCII, for UTF8 symbols converion will
be done by symbol.

Consider the following example:

    КВ АМ И СЖ
             ^ - offset is 15, length of sequence is 2
                 so first byte of a symbol is in first 16 bytes
                 second byte of a symbol is not ther

And in this case it will be handled incorrectly because it will try to
process oly these 16 bytes w/o looking forward.

This had been broken by #41286, before this patch it does not looks at
the row boundaries but only at the string end and so this sutation
wasn't possible.

Fixes: #42756
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-10-30 07:50:05 +00:00
+								    static bool toCase(const UInt8 *& src, const UInt8 * src_end, UInt8 *& dst, bool partial)
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    {
 								        if (src[0] <= ascii_upper_bound)
 								        {
 								            if (*src >= not_case_lower_bound && *src <= not_case_upper_bound)
 								                *dst++ = *src++ ^ flip_case_mask;
 								            else
 								                *dst++ = *src++;
 								        }
 								        else if (src + 1 < src_end
 								            && ((src[0] == 0xD0u && (src[1] >= 0x80u && src[1] <= 0xBFu)) || (src[0] == 0xD1u && (src[1] >= 0x80u && src[1] <= 0x9Fu))))
 								        {
 								            cyrillic_to_case(src, dst);
 								        }
 								        else if (src + 1 < src_end && src[0] == 0xC2u)
 								        {
 								            /// Punctuation U+0080 - U+00BF, UTF-8: C2 80 - C2 BF
 								            *dst++ = *src++;
 								            *dst++ = *src++;
 								        }
 								        else if (src + 2 < src_end && src[0] == 0xE2u)
 								        {
 								            /// Characters U+2000 - U+2FFF, UTF-8: E2 80 80 - E2 BF BF
 								            *dst++ = *src++;
 								            *dst++ = *src++;
 								            *dst++ = *src++;
 								        }
 								        else
 								        {
 								            static const Poco::UTF8Encoding utf8;
-												Remove tons of garbage

											
										
										
											2021-01-31 02:36:52 +00:00
+								            size_t src_sequence_length = UTF8::seqLength(*src);
-												Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary

In lowerUTF8()/upperUTF8() there is an SSE optimization that handles
16 byte at a time, but only for ASCII, for UTF8 symbols converion will
be done by symbol.

Consider the following example:

    КВ АМ И СЖ
             ^ - offset is 15, length of sequence is 2
                 so first byte of a symbol is in first 16 bytes
                 second byte of a symbol is not ther

And in this case it will be handled incorrectly because it will try to
process oly these 16 bytes w/o looking forward.

This had been broken by #41286, before this patch it does not looks at
the row boundaries but only at the string end and so this sutation
wasn't possible.

Fixes: #42756
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-10-30 07:50:05 +00:00
+								            /// In case partial buffer was passed (due to SSE optimization)
 								            /// we cannot convert it with current src_end, but we may have more
 								            /// bytes to convert and eventually got correct symbol.
 								            if (partial && src_sequence_length > static_cast<size_t>(src_end-src))
 								                return false;
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
-												Remove tons of garbage

											
										
										
											2021-01-31 02:36:52 +00:00
+								            auto src_code_point = UTF8::convertUTF8ToCodePoint(src, src_end - src);
 								            if (src_code_point)
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								            {
-												Remove tons of garbage

											
										
										
											2021-01-31 02:36:52 +00:00
+								                int dst_code_point = to_case(*src_code_point);
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
+								                if (dst_code_point > 0)
 								                {
-												Remove tons of garbage

											
										
										
											2021-01-31 02:36:52 +00:00
+								                    size_t dst_sequence_length = UTF8::convertCodePointToUTF8(dst_code_point, dst, src_end - src);
 								                    assert(dst_sequence_length <= 4);
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
 								                    /// We don't support cases when lowercase and uppercase characters occupy different number of bytes in UTF-8.
 								                    /// As an example, this happens for ß and ẞ.
 								                    if (dst_sequence_length == src_sequence_length)
 								                    {
 								                        src += dst_sequence_length;
 								                        dst += dst_sequence_length;
-												Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary

In lowerUTF8()/upperUTF8() there is an SSE optimization that handles
16 byte at a time, but only for ASCII, for UTF8 symbols converion will
be done by symbol.

Consider the following example:

    КВ АМ И СЖ
             ^ - offset is 15, length of sequence is 2
                 so first byte of a symbol is in first 16 bytes
                 second byte of a symbol is not ther

And in this case it will be handled incorrectly because it will try to
process oly these 16 bytes w/o looking forward.

This had been broken by #41286, before this patch it does not looks at
the row boundaries but only at the string end and so this sutation
wasn't possible.

Fixes: #42756
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-10-30 07:50:05 +00:00
+								                        return true;
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
+								                    }
 								                }
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								            }
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
-												Remove tons of garbage

											
										
										
											2021-01-31 02:36:52 +00:00
+								            *dst = *src;
 								            ++dst;
 								            ++src;
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								        }
-												Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary

In lowerUTF8()/upperUTF8() there is an SSE optimization that handles
16 byte at a time, but only for ASCII, for UTF8 symbols converion will
be done by symbol.

Consider the following example:

    КВ АМ И СЖ
             ^ - offset is 15, length of sequence is 2
                 so first byte of a symbol is in first 16 bytes
                 second byte of a symbol is not ther

And in this case it will be handled incorrectly because it will try to
process oly these 16 bytes w/o looking forward.

This had been broken by #41286, before this patch it does not looks at
the row boundaries but only at the string end and so this sutation
wasn't possible.

Fixes: #42756
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-10-30 07:50:05 +00:00
 								        return true;
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
 								private:
 								    static constexpr auto ascii_upper_bound = '\x7f';
 								    static constexpr auto flip_case_mask = 'A' ^ 'a';
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								    static void array(const UInt8 * src, const UInt8 * src_end, const ColumnString::Offsets & offsets, UInt8 * dst)
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    {
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								        auto offset_it = offsets.begin();
 								        const UInt8 * begin = src;
-												Added some warnings from clang's -Weverything

											
										
										
											2019-01-04 12:10:00 +00:00
+								#ifdef __SSE2__
-												Fixed error in lowerUTF8 and upperUTF8 functions [#CLICKHOUSE-2]

											
										
										
											2018-11-25 22:26:36 +00:00
+								        static constexpr auto bytes_sse = sizeof(__m128i);
-												Fix clang-tidy warnings in Disks, Formats, Functions folders

											
										
										
											2022-03-12 18:05:50 +00:00
+								        const auto * src_end_sse = src + (src_end - src) / bytes_sse * bytes_sse;
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
 								        /// SSE2 packed comparison operate on signed types, hence compare (c < 0) instead of (c > 0x7f)
 								        const auto v_zero = _mm_setzero_si128();
 								        const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1);
 								        const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1);
 								        const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask);
 								        while (src < src_end_sse)
 								        {
 								            const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
 								            /// check for ASCII
 								            const auto is_not_ascii = _mm_cmplt_epi8(chars, v_zero);
 								            const auto mask_is_not_ascii = _mm_movemask_epi8(is_not_ascii);
 								            /// ASCII
 								            if (mask_is_not_ascii == 0)
 								            {
 								                const auto is_not_case
 								                    = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), _mm_cmplt_epi8(chars, v_not_case_upper_bound));
 								                const auto mask_is_not_case = _mm_movemask_epi8(is_not_case);
 								                /// everything in correct case ASCII
 								                if (mask_is_not_case == 0)
 								                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), chars);
 								                else
 								                {
 								                    /// ASCII in mixed case
 								                    /// keep `flip_case_mask` only where necessary, zero out elsewhere
 								                    const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case);
 								                    /// flip case by applying calculated mask
 								                    const auto cased_chars = _mm_xor_si128(chars, xor_mask);
 								                    /// store result back to destination
 								                    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), cased_chars);
 								                }
 								                src += bytes_sse;
 								                dst += bytes_sse;
 								            }
 								            else
 								            {
 								                /// UTF-8
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
 								                size_t offset_from_begin = src - begin;
 								                while (offset_from_begin >= *offset_it)
 								                    ++offset_it;
 								                /// Do not allow one row influence another (since row may have invalid sequence, and break the next)
 								                const UInt8 * row_end = begin + *offset_it;
 								                chassert(row_end >= src);
 								                const UInt8 * expected_end = std::min(src + bytes_sse, row_end);
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
 								                while (src < expected_end)
-												Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary

In lowerUTF8()/upperUTF8() there is an SSE optimization that handles
16 byte at a time, but only for ASCII, for UTF8 symbols converion will
be done by symbol.

Consider the following example:

    КВ АМ И СЖ
             ^ - offset is 15, length of sequence is 2
                 so first byte of a symbol is in first 16 bytes
                 second byte of a symbol is not ther

And in this case it will be handled incorrectly because it will try to
process oly these 16 bytes w/o looking forward.

This had been broken by #41286, before this patch it does not looks at
the row boundaries but only at the string end and so this sutation
wasn't possible.

Fixes: #42756
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-10-30 07:50:05 +00:00
+								                {
 								                    if (!toCase(src, expected_end, dst, /* partial= */ true))
 								                    {
 								                        /// Fallback to handling byte by byte.
 								                        src_end_sse = src;
 								                        break;
 								                    }
 								                }
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								            }
 								        }
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
 								        /// Find which offset src has now
 								        while (offset_it != offsets.end() && static_cast<size_t>(src - begin) >= *offset_it)
 								            ++offset_it;
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								#endif
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
 								        /// handle remaining symbols, row by row (to avoid influence of bad UTF8 symbols from one row, to another)
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								        while (src < src_end)
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								        {
 								            const UInt8 * row_end = begin + *offset_it;
 								            chassert(row_end >= src);
 								            while (src < row_end)
-												Fix lowerUTF8()/upperUTF8() in case of symbol was in between 16-byte boundary

In lowerUTF8()/upperUTF8() there is an SSE optimization that handles
16 byte at a time, but only for ASCII, for UTF8 symbols converion will
be done by symbol.

Consider the following example:

    КВ АМ И СЖ
             ^ - offset is 15, length of sequence is 2
                 so first byte of a symbol is in first 16 bytes
                 second byte of a symbol is not ther

And in this case it will be handled incorrectly because it will try to
process oly these 16 bytes w/o looking forward.

This had been broken by #41286, before this patch it does not looks at
the row boundaries but only at the string end and so this sutation
wasn't possible.

Fixes: #42756
Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-10-30 07:50:05 +00:00
+								                toCase(src, row_end, dst, /* partial= */ false);
-												Do not allow invalid sequences influence other rows in lowerUTF8/upperUTF8

Right now lowerUTF8() and upperUTF8() does not respect row boundaries,
and so one row may break another.

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>

											
										
										
											2022-09-13 19:08:01 +00:00
+								            ++offset_it;
 								        }
-												Every function in its own file, part 3

											
										
										
											2018-09-09 23:36:06 +00:00
+								    }
 								};
 								}