Refactoring StringHash

2024-09-26 19:50:51 +00:00 · 2020-12-25 14:22:48 +03:00 · 2020-12-25 14:22:48 +03:00 · 04d55dc495
commit 04d55dc495
parent e8f4a19a10
2 changed files with 222 additions and 122 deletions
--- a/src/Functions/ExtractString.h
+++ b/src/Functions/ExtractString.h
@ -23,112 +23,51 @@ namespace ErrorCodes

 // used by FunctionsStringSimilarity and FunctionsStringHash
 // includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word
-template <bool CaseInsensitive>
 struct ExtractStringImpl
 {
-    /// Padding form ColumnsString. It is a number of bytes we can always read starting from pos if pos < end.
-    static constexpr size_t default_padding = 16;
-
-    const size_t shingle_size;
-    const size_t tail_size;
-
-    /// Functions are read `default_padding - (N - 1)` bytes into the buffer. Window of size N is used.
-    /// Read copies `N - 1` last bytes from buffer into beginning, and then reads new bytes.
-    const size_t buffer_size = default_padding + tail_size;
-
-    explicit ExtractStringImpl(size_t shingle_size_)
-            : shingle_size(shingle_size_)
-            , tail_size(shingle_size > default_padding ? shingle_size : roundUpToPowerOfTwoOrZero(shingle_size - 1))
-    {
-        if (shingle_size == 0)
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Shingle size can't be zero");
-    }
-
    // read a ASCII word
-    static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
+    static ALWAYS_INLINE inline const UInt8 * readOneASCIIWord(const UInt8 *& pos, const UInt8 * end)
    {
        // jump separators
        while (pos < end && !isAlphaNumericASCII(*pos))
            ++pos;

        // word start from here
-        const char * word_start = pos;
+        const UInt8 * word_start = pos;
        while (pos < end && isAlphaNumericASCII(*pos))
            ++pos;

-        word_buf.assign(word_start, pos);
-        if (CaseInsensitive)
-        {
-            for (auto & symbol : word_buf)
-                symbol = toLowerIfAlphaASCII(symbol);
-        }
-        return word_buf.size();
+        return word_start;
    }

    // read one UTF8 word from pos to word
-    static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
+    static ALWAYS_INLINE inline const UInt8 * readOneUTF8Word(const UInt8 *& pos, const UInt8 * end)
    {
        // jump UTF8 separator
        while (pos < end && isUTF8Sep(*pos))
            ++pos;
-        word_buf.clear();
+
        // UTF8 word's character number
+        const UInt8 * word_start = pos;
+
        while (pos < end && !isUTF8Sep(*pos))
-        {
-            word_buf.push_back(readOneUTF8Code(pos, end));
-        }
-        return word_buf.size();
+            readOneUTF8Code(pos, end);
+
+        return word_start;
    }

    // we use ASCII non-alphanum character as UTF8 separator
    static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }

-    // read one UTF8 character and return it
-    static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
+    // read one UTF8 character
+    static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end)
    {
        size_t length = UTF8::seqLength(*pos);

        if (pos + length > end)
            length = end - pos;
-        UInt32 res;
-        switch (length)
-        {
-            case 1:
-                res = 0;
-                memcpy(&res, pos, 1);
-                break;
-            case 2:
-                res = 0;
-                memcpy(&res, pos, 2);
-                break;
-            case 3:
-                res = 0;
-                memcpy(&res, pos, 3);
-                break;
-            default:
-                memcpy(&res, pos, 4);
-        }

-        if constexpr (CaseInsensitive)
-        {
-            switch (length)
-            {
-                case 4:
-                    res &= ~(1u << (5 + 3 * CHAR_BIT));
-                    [[fallthrough]];
-                case 3:
-                    res &= ~(1u << (5 + 2 * CHAR_BIT));
-                    [[fallthrough]];
-                case 2:
-                    res &= ~(1u);
-                    res &= ~(1u << (5 + CHAR_BIT));
-                    [[fallthrough]];
-                default:
-                    res &= ~(1u << 5);
-            }
-        }
        pos += length;
-        return res;
    }
 };
 }
--- a/src/Functions/FunctionsStringHash.cpp
+++ b/src/Functions/FunctionsStringHash.cpp
@ -46,6 +46,17 @@ struct Hash
 #endif
    }

+    static UInt64 crc32u16(UInt64 crc [[maybe_unused]], UInt16 val [[maybe_unused]])
+    {
+#ifdef __SSE4_2__
+        return _mm_crc32_u16(crc, val);
+#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+        return __crc32ch(crc, val);
+#else
+        throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
+#endif
+    }
+
    static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]])
    {
 #ifdef __SSE4_2__
@ -57,18 +68,70 @@ struct Hash
 #endif
    }

-    static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points)
+    template <bool CaseInsensitive>
+    static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size)
    {
-        return crc32u64(-1ULL, unalignedLoad<UInt32>(code_points));
+        if (size & 1)
+        {
+            UInt8 x = *start;
+
+            if constexpr (CaseInsensitive)
+                x |= 0x20u; /// see toLowerIfAlphaASCII from StringUtils.h
+
+            crc = crc32u8(crc, x);
+            --size;
+            ++start;
+        }
+
+        if (size & 2)
+        {
+            UInt16 x = unalignedLoad<UInt16>(start);
+
+            if constexpr (CaseInsensitive)
+                x |= 0x2020u;
+
+            crc = crc32u16(crc, x);
+            size -= 2;
+            start += 2;
+        }
+
+        if (size & 4)
+        {
+            UInt32 x = unalignedLoad<UInt32>(start);
+
+            if constexpr (CaseInsensitive)
+                x |= 0x20202020u;
+
+            crc = crc32u32(crc, x);
+            size -= 4;
+            start += 4;
+        }
+
+        while (size)
+        {
+            UInt64 x = unalignedLoad<UInt64>(start);
+
+            if constexpr (CaseInsensitive)
+                x |= 0x2020202020202020u;
+
+            crc = crc32u64(crc, x);
+            size -= 8;
+            start += 8;
+        }
+
+        return crc;
    }

-    static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points)
+    /// Hash a shingle that is stored as a ring buffer of words.
+    /// The CRC is seeded with -1ULL and fed the words at indices [offset, shingle.size())
+    /// followed by the words at indices [0, offset), so `offset` is the index of the word
+    /// that is hashed first. Returns the accumulated CRC.
+    /// NOTE(review): passes StringRef::data to a `const UInt8 *` parameter; confirm that
+    /// conversion is valid for the StringRef type used here.
+    template <bool CaseInsensitive>
+    static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector<StringRef> & shingle, size_t offset = 0)
    {
        UInt64 crc = -1ULL;
-        crc = crc32u64(crc, code_points[0]);
-        crc = crc32u64(crc, code_points[1]);
-        crc = crc32u64(crc, code_points[2]);
-        return crc;
+
+        for (size_t i = offset; i < shingle.size(); ++i)
+            crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
+
+        for (size_t i = 0; i < offset; ++i)
+            crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
+
+        /// The result must be returned explicitly: flowing off the end of a non-void
+        /// function is undefined behaviour.
+        return crc;
    }

    static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
@ -148,54 +211,82 @@ struct Hash
 template <size_t N, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
 struct SimHashImpl
 {
-    using StrOp = ExtractStringImpl<N, CaseInsensitive>;
-    static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
+    //using StrOp = ExtractStringImpl<N, CaseInsensitive>;
+    //static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;

-    // SimHash ngram calculate function: String ->UInt64
+    /// Update fingerprint according to hash_value bits.
+    static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value)
+    {
+        for (size_t i = 0; i < 64; ++i)
+            finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1;
+    }
+
+    /// Return a 64 bit value according to finger_vec.
+    static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec)
+    {
+        UInt64 res = 0;
+
+        for (size_t i = 0; i < 64; ++i)
+            if (finger_vec[i] > 0)
+                res |= (1ULL << i);
+
+        return res;
+    }
+
+    // SimHash ngram calculate function: String -> UInt64
    // this function extracting ngram from input string, and maintain a 64-dimensions vector
    // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value
    // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0
-    static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue(
-        const char * data,
-        size_t size,
-        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
-        UInt64 (*hash_functor)(const CodePoint *))
+
+    static ALWAYS_INLINE inline UInt64 ngramHash(const UInt8 * data, size_t size, size_t shingle_size)
    {
-        const char * start = data;
-        const char * end = data + size;
-        // fingerprint vector, all dimensions initialized to zero at the first
+        if (size < shingle_size)
+            return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
+
        Int64 finger_vec[64] = {};
-        CodePoint cp[simultaneously_codepoints_num] = {};
+        const UInt8 * end = data + size;

-        size_t found = read_code_points(cp, start, end);
-        size_t iter = N - 1;
-
-        do
+        for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos)
        {
-            for (; iter + N <= found; ++iter)
-            {
-                // for each ngram, we can calculate an 64 bit hash
-                // then update finger_vec according to this hash value
-                // if the i'th bit is 1, finger_vec[i] plus 1, otherwise minus 1
-                UInt64 hash_value = hash_functor(cp + iter);
-                std::bitset<64> bits(hash_value);
-                for (size_t i = 0; i < 64; ++i)
-                {
-                    finger_vec[i] += ((bits.test(i)) ? 1 : -1);
-                }
-            }
-            iter = 0;
-        } while (start < end && (found = read_code_points(cp, start, end)));
-
-        // finally, we return a 64 bit value according to finger_vec
-        // if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0
-        std::bitset<64> res_bit(0u);
-        for (size_t i = 0; i < 64; ++i)
-        {
-            if (finger_vec[i] > 0)
-                res_bit.set(i);
+            UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, pos, shingle_size);
+            updateFingerVector(finger_vec, hash_value);
        }
-        return res_bit.to_ullong();
+
+        return getSimHash(finger_vec);
+    }
+
+    /// SimHash over n-grams of `shingle_size` UTF-8 code points taken from [data, data + size).
+    /// If the input ends before `shingle_size` code points have been read, the hash of the
+    /// whole input is returned instead. Otherwise a window of `shingle_size` code points is
+    /// slid over the input one code point at a time; every window, including the very
+    /// first one, is hashed and folded into the fingerprint vector.
+    static ALWAYS_INLINE inline UInt64 ngramHashUTF8(
+        size_t shingle_size,
+        const UInt8 * data,
+        size_t size)
+    {
+        const UInt8 * end = data + size;
+
+        const UInt8 * word_start = data;
+        const UInt8 * word_end = data;
+
+        /// Prime the window: move word_end past the first shingle_size code points.
+        for (size_t i = 0; i < shingle_size; ++i)
+        {
+            if (word_end >= end)
+                return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
+
+            ExtractStringImpl::readOneUTF8Code(word_end, end);
+        }
+
+        Int64 finger_vec[64] = {};
+
+        while (true)
+        {
+            /// Hash the current window before moving it, so that the first n-gram
+            /// contributes to the fingerprint as well.
+            size_t length = word_end - word_start;
+            UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, word_start, length);
+            updateFingerVector(finger_vec, hash_value);
+
+            if (word_end >= end)
+                break;
+
+            /// Slide the window forward by one code point on both sides.
+            ExtractStringImpl::readOneUTF8Code(word_start, word_end);
+            ExtractStringImpl::readOneUTF8Code(word_end, end);
+        }
+
+        return getSimHash(finger_vec);
+    }

    // SimHash word shingle calculate function: String -> UInt64
@ -208,11 +299,81 @@ struct SimHashImpl
    // to calculate the first word shingle hash value
    // 2. next, we extract one word each time, and calculate a new hash value of the new word,then use the latest N hash
    // values to calculate the next word shingle hash value
+
+    static ALWAYS_INLINE inline UInt64 wordShingleHash(
+        const UInt8 * data,
+        size_t size,
+        size_t shingle_size)
+    {
+        const UInt8 * start = data;
+        const UInt8 * end = data + size;
+
+        // A 64 bit vector initialized to zero.
+        Int64 finger_vec[64] = {};
+        // An array to store N words.
+        std::vector<StringRef> words;
+        words.reserve(shingle_size);
+
+        // get first word shingle
+        while (start < end && words.size() < shingle_size)
+        {
+            const UInt8 * word_start = nullptr;
+
+            if constexpr (UTF8)
+                word_start = ExtractStringImpl::readOneUTF8Word(start, end);
+            else
+                word_start = ExtractStringImpl::readOneASCIIWord(start, end);
+
+            size_t length = start - word_start;
+
+            if (length)
+                words.emplace_back(word_start, length);
+        }
+
+        UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(words);
+        updateFingerVector(finger_vec, hash_value);
+
+        size_t offset = 0;
+        while (start < end)
+        {
+            const UInt8 * word_start = nullptr;
+
+            if constexpr (UTF8)
+                word_start = ExtractStringImpl::readOneUTF8Word(start, end);
+            else
+                word_start = ExtractStringImpl::readOneASCIIWord(start, end);
+
+            size_t length = start - word_start;
+
+            if (length == 0)
+                continue;
+
+            // we need to store the new word hash value to the oldest location.
+            // for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location,
+            // so we need to store new word hash into location of a0, then ,this array become
+            // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new
+            // word hash value into location of a1, then array become |a5|a6|a2|a3|a4|
+            words[offset] = StringRef(word_start, length);
+            ++offset;
+            if (offset >= shingle_size)
+                offset = 0;
+
+            // according to the word hash storation way, in order to not lose the word shingle's
+            // sequence information, when calculation word shingle hash value, we need provide the offset
+            // information, which is the offset of the first word's hash value of the word shingle
+            hash_value = Hash::shingleHash<CaseInsensitive>(words, offset);
+            updateFingerVector(finger_vec, hash_value);
+        }
+
+        return getSimHash(finger_vec);
+    }
+
    static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
        const char * data,
        size_t size,
-        size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
-        UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
+        size_t shingle_size,
+        size_t heap_size,
+        size_t max_word_length)
    {
        const char * start = data;
        const char * end = data + size;
@ -220,7 +381,7 @@ struct SimHashImpl
        // Also, a 64 bit vector initialized to zero
        Int64 finger_vec[64] = {};
        // a array to store N word hash values
-        UInt64 nword_hashes[N] = {};
+        std::vector<UInt64> word_hashes(shingle_size, 0);
        // word buffer to store one word
        PaddedPODArray<CodePoint> word_buf;
        // get first word shingle