Merge pull request #34683 from vdimir/fix_aarch64/position_utf_8

Fix `positionUTF8` on aarch64
2024-11-25 17:12:03 +00:00 · 2022-02-20 11:35:40 +01:00 · 2022-02-20 11:35:40 +01:00 · 5284083647
commit 5284083647
parent b75fbaa6a9 73fe35e552
7 changed files with 86 additions and 52 deletions
--- a/src/Common/StringSearcher.h
+++ b/src/Common/StringSearcher.h
@ -158,8 +158,8 @@ public:
                int c_l_u32 = Poco::Unicode::toLower(*c_u32);
                int c_u_u32 = Poco::Unicode::toUpper(*c_u32);

-                uint8_t dst_l_len = static_cast<uint8_t>(UTF8::convertCodePointToUTF8(c_l_u32, l_seq, sizeof(l_seq)));
-                uint8_t dst_u_len = static_cast<uint8_t>(UTF8::convertCodePointToUTF8(c_u_u32, u_seq, sizeof(u_seq)));
+                size_t dst_l_len = UTF8::convertCodePointToUTF8(c_l_u32, l_seq, sizeof(l_seq));
+                size_t dst_u_len = UTF8::convertCodePointToUTF8(c_u_u32, u_seq, sizeof(u_seq));

                /// @note Unicode standard states it is a rare but possible occasion
                if (!(dst_l_len == dst_u_len && dst_u_len == src_len))
--- a/src/Common/UTF8Helpers.h
+++ b/src/Common/UTF8Helpers.h
@ -76,7 +76,7 @@ inline size_t countCodePoints(const UInt8 * data, size_t size)


 template <typename CharT, typename = std::enable_if_t<sizeof(CharT) == 1>>
-size_t convertCodePointToUTF8(uint32_t code_point, CharT * out_bytes, size_t out_length)
+size_t convertCodePointToUTF8(int code_point, CharT * out_bytes, size_t out_length)
 {
    static const Poco::UTF8Encoding utf8;
    int res = utf8.convert(code_point, reinterpret_cast<uint8_t *>(out_bytes), out_length);
--- a/src/Common/Volnitsky.h
+++ b/src/Common/Volnitsky.h
@ -60,7 +60,7 @@ namespace VolnitskyTraits
    static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); }

    template <typename Callback>
-    static inline void putNGramASCIICaseInsensitive(const UInt8 * pos, int offset, Callback && putNGramBase)
+    static inline bool putNGramASCIICaseInsensitive(const UInt8 * pos, int offset, Callback && putNGramBase)
    {
        struct Chars
        {
@ -107,10 +107,12 @@ namespace VolnitskyTraits
        else
            /// 1 combination: 01
            putNGramBase(n, offset);
+
+        return true;
    }

    template <typename Callback>
-    static inline void putNGramUTF8CaseInsensitive(
+    static inline bool putNGramUTF8CaseInsensitive(
        const UInt8 * pos, int offset, const UInt8 * begin, size_t size, Callback && putNGramBase)
    {
        const UInt8 * end = begin + size;
@ -131,7 +133,7 @@ namespace VolnitskyTraits

        if (isascii(chars.c0) && isascii(chars.c1))
        {
-            putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
+            return putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
        }
        else
        {
@ -177,21 +179,25 @@ namespace VolnitskyTraits
                        /// where is the given ngram in respect to the start of UTF-8 sequence?
                        size_t seq_ngram_offset = pos - seq_pos;

-                        Seq seq;
+                        Seq seq_l;
+                        size_t length_l = UTF8::convertCodePointToUTF8(l_u32, seq_l, sizeof(seq_l));

-                        /// put ngram for lowercase
-                        size_t length_l [[maybe_unused]] = UTF8::convertCodePointToUTF8(l_u32, seq, sizeof(seq));
-                        assert(length_l >= 2);
-                        chars.c0 = seq[seq_ngram_offset];
-                        chars.c1 = seq[seq_ngram_offset + 1];
+                        Seq seq_r;
+                        size_t length_r = UTF8::convertCodePointToUTF8(u_u32, seq_r, sizeof(seq_r));
+
+                        if (length_l != length_r)
+                            return false;
+
+                        assert(length_l >= 2 && length_r >= 2);
+
+                        chars.c0 = seq_l[seq_ngram_offset];
+                        chars.c1 = seq_l[seq_ngram_offset + 1];
                        putNGramBase(n, offset);

-                        /// put ngram for uppercase
-                        size_t length_r [[maybe_unused]] = UTF8::convertCodePointToUTF8(u_u32, seq, sizeof(seq));
-                        assert(length_r >= 2);
-                        chars.c0 = seq[seq_ngram_offset]; //-V519
-                        chars.c1 = seq[seq_ngram_offset + 1]; //-V519
+                        chars.c0 = seq_r[seq_ngram_offset]; //-V519
+                        chars.c1 = seq_r[seq_ngram_offset + 1]; //-V519
                        putNGramBase(n, offset);
+
                    }
                }
            }
@ -235,40 +241,47 @@ namespace VolnitskyTraits
                else if (first_l_u32 == first_u_u32)
                {
                    /// first symbol is case-independent
-                    Seq seq;
+                    Seq seq_l;
+                    size_t size_l = UTF8::convertCodePointToUTF8(second_l_u32, seq_l, sizeof(seq_l));

-                    /// put ngram for lowercase
-                    size_t size_l [[maybe_unused]] = UTF8::convertCodePointToUTF8(second_l_u32, seq, sizeof(seq));
-                    assert(size_l >= 1);
-                    chars.c1 = seq[0];
+                    Seq seq_u;
+                    size_t size_u = UTF8::convertCodePointToUTF8(second_u_u32, seq_u, sizeof(seq_u));
+
+                    if (size_l != size_u)
+                        return false;
+
+                    assert(size_l >= 1 && size_u >= 1);
+                    chars.c1 = seq_l[0];
                    putNGramBase(n, offset);

                    /// put ngram from uppercase, if it is different
-                    size_t size_u [[maybe_unused]] = UTF8::convertCodePointToUTF8(second_u_u32, seq, sizeof(seq));
-                    assert(size_u >= 1);
-                    if (chars.c1 != seq[0])
+                    if (chars.c1 != seq_u[0])
                    {
-                        chars.c1 = seq[0];
+                        chars.c1 = seq_u[0];
                        putNGramBase(n, offset);
                    }
                }
                else if (second_l_u32 == second_u_u32)
                {
                    /// second symbol is case-independent
-                    Seq seq;

-                    /// put ngram for lowercase
-                    size_t size_l [[maybe_unused]] = UTF8::convertCodePointToUTF8(first_l_u32, seq, sizeof(seq));
-                    assert(size_l > seq_ngram_offset);
-                    chars.c0 = seq[seq_ngram_offset];
+                    Seq seq_l;
+                    size_t size_l = UTF8::convertCodePointToUTF8(first_l_u32, seq_l, sizeof(seq_l));
+                    Seq seq_u;
+                    size_t size_u = UTF8::convertCodePointToUTF8(first_u_u32, seq_u, sizeof(seq_u));
+
+                    if (size_l != size_u)
+                        return false;
+
+                    assert(size_l > seq_ngram_offset && size_u > seq_ngram_offset);
+
+                    chars.c0 = seq_l[seq_ngram_offset];
                    putNGramBase(n, offset);

                    /// put ngram for uppercase, if it is different
-                    size_t size_u [[maybe_unused]] = UTF8::convertCodePointToUTF8(first_u_u32, seq, sizeof(seq));
-                    assert(size_u > seq_ngram_offset);
-                    if (chars.c0 != seq[seq_ngram_offset])
+                    if (chars.c0 != seq_u[seq_ngram_offset])
                    {
-                        chars.c0 = seq[seq_ngram_offset];
+                        chars.c0 = seq_u[seq_ngram_offset];
                        putNGramBase(n, offset);
                    }
                }
@ -279,10 +292,12 @@ namespace VolnitskyTraits
                    Seq second_l_seq;
                    Seq second_u_seq;

-                    size_t size_first_l [[maybe_unused]] = UTF8::convertCodePointToUTF8(first_l_u32, first_l_seq, sizeof(first_l_seq));
-                    size_t size_first_u [[maybe_unused]] = UTF8::convertCodePointToUTF8(first_u_u32, first_u_seq, sizeof(first_u_seq));
-                    size_t size_second_l [[maybe_unused]] = UTF8::convertCodePointToUTF8(second_l_u32, second_l_seq, sizeof(second_l_seq));
-                    size_t size_second_u [[maybe_unused]] = UTF8::convertCodePointToUTF8(second_u_u32, second_u_seq, sizeof(second_u_seq));
+                    size_t size_first_l = UTF8::convertCodePointToUTF8(first_l_u32, first_l_seq, sizeof(first_l_seq));
+                    size_t size_first_u = UTF8::convertCodePointToUTF8(first_u_u32, first_u_seq, sizeof(first_u_seq));
+                    size_t size_second_l = UTF8::convertCodePointToUTF8(second_l_u32, second_l_seq, sizeof(second_l_seq));
+                    size_t size_second_u = UTF8::convertCodePointToUTF8(second_u_u32, second_u_seq, sizeof(second_u_seq));
+                    if (size_first_l != size_first_u || size_second_l != size_second_u)
+                        return false;

                    assert(size_first_l > seq_ngram_offset);
                    assert(size_first_u > seq_ngram_offset);
@ -325,17 +340,25 @@ namespace VolnitskyTraits
                }
            }
        }
+        return true;
    }

    template <bool CaseSensitive, bool ASCII, typename Callback>
-    static inline void putNGram(const UInt8 * pos, int offset, [[maybe_unused]] const UInt8 * begin, size_t size, Callback && putNGramBase)
+    static inline bool putNGram(const UInt8 * pos, int offset, [[maybe_unused]] const UInt8 * begin, size_t size, Callback && putNGramBase)
    {
        if constexpr (CaseSensitive)
+        {
            putNGramBase(toNGram(pos), offset);
+            return true;
+        }
        else if constexpr (ASCII)
-            putNGramASCIICaseInsensitive(pos, offset, std::forward<Callback>(putNGramBase));
+        {
+            return putNGramASCIICaseInsensitive(pos, offset, std::forward<Callback>(putNGramBase));
+        }
        else
-            putNGramUTF8CaseInsensitive(pos, offset, begin, size, std::forward<Callback>(putNGramBase));
+        {
+            return putNGramUTF8CaseInsensitive(pos, offset, begin, size, std::forward<Callback>(putNGramBase));
+        }
    }
 }

@ -381,7 +404,20 @@ public:
        /// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
        /// And also adding from the end guarantees that we will find first occurrence because we will lookup bigger offsets first.
        for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i)
-            VolnitskyTraits::putNGram<CaseSensitive, ASCII>(needle + i, i + 1, needle, needle_size, callback);
+        {
+            bool ok = VolnitskyTraits::putNGram<CaseSensitive, ASCII>(needle + i, i + 1, needle, needle_size, callback);
+
+            /** `putNGramUTF8CaseInsensitive` does not work if the lowercase and uppercase forms of a character
+              * are represented by a different number of bytes or code points.
+              * In that case it returns false, so switch to the fallback searcher.
+              */
+            if (!ok)
+            {
+                fallback_searcher.force_fallback = true;
+                hash = nullptr;
+                return;
+            }
+        }
    }


--- a/tests/queries/0_stateless/00926_multimatch.sql
+++ b/tests/queries/0_stateless/00926_multimatch.sql
@ -1,5 +1,4 @@
-- Tags: no-fasttest
-- Tag no-fasttest: Hyperscan
+-- Tags: no-fasttest, use-hyperscan

 select 0 = multiMatchAny(materialize('mpnsguhwsitzvuleiwebwjfitmsg'), ['wbirxqoabpblrnvvmjizj', 'cfcxhuvrexyzyjsh', 'oldhtubemyuqlqbwvwwkwin', 'bumoozxdkjglzu', 'intxlfohlxmajjomw', 'dxkeghohv', 'arsvmwwkjeopnlwnan', 'ouugllgowpqtaxslcopkytbfhifaxbgt', 'hkedmjlbcrzvryaopjqdjjc', 'tbqkljywstuahzh', 'o', 'wowoclosyfcuwotmvjygzuzhrery', 'vpefjiffkhlggntcu', 'ytdixvasrorhripzfhjdmlhqksmctyycwp']) from system.numbers limit 10;
 select 0 = multiMatchAny(materialize('qjjzqexjpgkglgxpzrbqbnskq'), ['vaiatcjacmlffdzsejpdareqzy', 'xspcfzdufkmecud', 'bcvtbuqtctq', 'nkcopwbfytgemkqcfnnno', 'dylxnzuyhq', 'tno', 'scukuhufly', 'cdyquzuqlptv', 'ohluyfeksyxepezdhqmtfmgkvzsyph', 'ualzwtahvqvtijwp', 'jg', 'gwbawqlngzcknzgtmlj', 'qimvjcgbkkp', 'eaedbcgyrdvv', 'qcwrncjoewwedyyewcdkh', 'uqcvhngoqngmitjfxpznqomertqnqcveoqk', 'ydrgjiankgygpm', 'axepgap']) from system.numbers limit 10;
--- a/tests/queries/0_stateless/00929_multi_match_edit_distance.sql
+++ b/tests/queries/0_stateless/00929_multi_match_edit_distance.sql
@ -1,5 +1,4 @@
-- Tags: no-fasttest
-- Tag no-fasttest: Hyperscan
+-- Tags: no-fasttest, use-hyperscan

 SET send_logs_level = 'fatal';

--- a/tests/queries/0_stateless/01670_dictionary_create_key_expression.reference
+++ b/tests/queries/0_stateless/01670_dictionary_create_key_expression.reference
@ -1,8 +1,8 @@
 Simple
-5791441145865411458	Test2
-3450587330153346914	Test1
 3111929972906540512	Test3
+3450587330153346914	Test1
+5791441145865411458	Test2
 Complex
 3111929972906540512	5	Test3
-5791441145865411458	5	Test2
 3450587330153346914	5	Test1
+5791441145865411458	5	Test2
--- a/tests/queries/0_stateless/01670_dictionary_create_key_expression.sql
+++ b/tests/queries/0_stateless/01670_dictionary_create_key_expression.sql
@ -17,7 +17,7 @@ SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'test_for
 LIFETIME(MIN 1 MAX 10)
 LAYOUT(HASHED());

-SELECT * FROM database_dictionary_test_key_expression.test_query_log_dictionary_simple;
+SELECT * FROM database_dictionary_test_key_expression.test_query_log_dictionary_simple ORDER BY value_id;

 DROP DICTIONARY IF EXISTS database_dictionary_test_key_expression.test_query_log_dictionary_simple;

@ -34,7 +34,7 @@ SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'test_for
 LIFETIME(MIN 1 MAX 10)
 LAYOUT(COMPLEX_KEY_HASHED());

-SELECT * FROM database_dictionary_test_key_expression.test_query_log_dictionary_complex;
+SELECT * FROM database_dictionary_test_key_expression.test_query_log_dictionary_complex ORDER BY value_id;

 DROP DICTIONARY IF EXISTS database_dictionary_test_key_expression.test_query_log_dictionary_complex;