#include #include #include #include #include #include #include #include #include #include #include #if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #include "vec_crc32.h" #endif namespace DB { namespace ErrorCodes { extern const int NOT_IMPLEMENTED; } struct BytesRef { const UInt8 * data; size_t size; }; struct Hash { static UInt64 crc32u64(UInt64 crc [[maybe_unused]], UInt64 val [[maybe_unused]]) { #ifdef __SSE4_2__ return _mm_crc32_u64(crc, val); #elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) return __crc32cd(static_cast(crc), val); #elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return crc32_ppc(crc, reinterpret_cast(&val), sizeof(val)); #elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ return s390x_crc32(crc, val); #else throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support"); #endif } static UInt64 crc32u32(UInt32 crc [[maybe_unused]], UInt32 val [[maybe_unused]]) { #ifdef __SSE4_2__ return _mm_crc32_u32(crc, val); #elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) return __crc32cw(crc, val); #elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return crc32_ppc(crc, reinterpret_cast(&val), sizeof(val)); #elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ return s390x_crc32_u32(crc, val); #else throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support"); #endif } static UInt64 crc32u16(UInt32 crc [[maybe_unused]], UInt16 val [[maybe_unused]]) { #ifdef __SSE4_2__ return _mm_crc32_u16(crc, val); #elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) return __crc32ch(crc, val); #elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return crc32_ppc(crc, reinterpret_cast(&val), sizeof(val)); #elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ return s390x_crc32_u16(crc, val); #else throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support"); #endif } static UInt64 crc32u8(UInt32 crc [[maybe_unused]], UInt8 val [[maybe_unused]]) { #ifdef __SSE4_2__ return _mm_crc32_u8(crc, val); #elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) return __crc32cb(crc, val); #elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return crc32_ppc(crc, reinterpret_cast(&val), sizeof(val)); #elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__ return s390x_crc32_u8(crc, val); #else throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support"); #endif } template static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size) { if (size & 1) { UInt8 x = *start; if constexpr (CaseInsensitive) x |= 0x20u; /// see toLowerIfAlphaASCII from StringUtils.h crc = crc32u8(static_cast(crc), x); --size; ++start; } if (size & 2) { UInt16 x = unalignedLoad(start); if constexpr (CaseInsensitive) x |= 0x2020u; crc = crc32u16(static_cast(crc), x); size -= 2; start += 2; } if (size & 4) { UInt32 x = unalignedLoad(start); if constexpr (CaseInsensitive) x |= 0x20202020u; crc = crc32u32(static_cast(crc), x); size -= 4; start += 4; } while (size) { UInt64 x = unalignedLoad(start); if constexpr (CaseInsensitive) x |= 0x2020202020202020u; crc = crc32u64(crc, x); size -= 8; start += 8; } return crc; } template static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector & shingle, size_t offset = 0) { UInt64 crc = -1ULL; for (size_t i = offset; i < shingle.size(); ++i) crc = shingleHash(crc, shingle[i].data, shingle[i].size); for (size_t i = 0; i < offset; ++i) crc = shingleHash(crc, shingle[i].data, shingle[i].size); return crc; } }; // SimHash String -> UInt64 // UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) // Ngram: means ngram(true) or words shingles(false) // CaseInsensitive: means should we consider about letter case or not template struct SimHashImpl { static constexpr size_t min_word_size = 4; /// Update fingerprint according to hash_value bits. static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value) { for (size_t i = 0; i < 64; ++i) finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1; } /// Return a 64 bit value according to finger_vec. static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec) { UInt64 res = 0; for (size_t i = 0; i < 64; ++i) if (finger_vec[i] > 0) res |= (1ULL << i); return res; } // SimHash ngram calculate function: String -> UInt64 // this function extracting ngram from input string, and maintain a 64-dimensions vector // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 static ALWAYS_INLINE inline UInt64 ngramHashASCII(const UInt8 * data, size_t size, size_t shingle_size) { if (size < shingle_size) return Hash::shingleHash(-1ULL, data, size); Int64 finger_vec[64] = {}; const UInt8 * end = data + size; for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos) { UInt64 hash_value = Hash::shingleHash(-1ULL, pos, shingle_size); updateFingerVector(finger_vec, hash_value); } return getSimHash(finger_vec); } static ALWAYS_INLINE inline UInt64 ngramHashUTF8(const UInt8 * data, size_t size, size_t shingle_size) { const UInt8 * start = data; const UInt8 * end = data + size; const UInt8 * word_start = start; const UInt8 * word_end = start; for (size_t i = 0; i < shingle_size; ++i) { if (word_end >= end) return Hash::shingleHash(-1ULL, data, size); ExtractStringImpl::readOneUTF8Code(word_end, end); } Int64 finger_vec[64] = {}; while (word_end < end) { ExtractStringImpl::readOneUTF8Code(word_start, word_end); ExtractStringImpl::readOneUTF8Code(word_end, end); size_t length = word_end - word_start; UInt64 hash_value = Hash::shingleHash(-1ULL, word_start, length); updateFingerVector(finger_vec, hash_value); } return getSimHash(finger_vec); } // SimHash word shingle calculate function: String -> UInt64 // this function extracting n word shingle from input string, and maintain a 64-dimensions vector as well // for each word shingle, calculate a 64 bit hash value, and update the vector according the hash value // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 // // word shingle hash value calculate: // 1. at the first, extracts N word shingles and calculate N hash values, store into an array, use this N hash values // to calculate the first word shingle hash value // 2. next, we extract one word each time, and calculate a new hash value of the new word,then use the latest N hash // values to calculate the next word shingle hash value static ALWAYS_INLINE inline UInt64 wordShingleHash(const UInt8 * data, size_t size, size_t shingle_size) { const UInt8 * start = data; const UInt8 * end = data + size; // A 64 bit vector initialized to zero. Int64 finger_vec[64] = {}; // An array to store N words. std::vector words; words.reserve(shingle_size); // get first word shingle while (start < end && words.size() < shingle_size) { const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end); size_t length = start - word_start; if (length >= min_word_size) words.emplace_back(BytesRef{word_start, length}); } if (words.empty()) return 0; UInt64 hash_value = Hash::shingleHash(words); updateFingerVector(finger_vec, hash_value); size_t offset = 0; while (start < end) { const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end); size_t length = start - word_start; if (length < min_word_size) continue; // we need to store the new word hash value to the oldest location. // for example, N = 5, array |a0|a1|a2|a3|a4|, now, a0 is the oldest location, // so we need to store new word hash into location of a0, then this array become // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new // word hash value into location of a1, then array become |a5|a6|a2|a3|a4| words[offset] = BytesRef{word_start, length}; ++offset; if (offset >= shingle_size) offset = 0; // according to the word hash storation way, in order to not lose the word shingle's // sequence information, when calculation word shingle hash value, we need provide the offset // information, which is the offset of the first word's hash value of the word shingle hash_value = Hash::shingleHash(words, offset); updateFingerVector(finger_vec, hash_value); } return getSimHash(finger_vec); } static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, size_t shingle_size, PaddedPODArray & res) { for (size_t i = 0; i < offsets.size(); ++i) { const UInt8 * one_data = &data[offsets[i - 1]]; const size_t data_size = offsets[i] - offsets[i - 1] - 1; if constexpr (Ngram) { if constexpr (!UTF8) res[i] = ngramHashASCII(one_data, data_size, shingle_size); else res[i] = ngramHashUTF8(one_data, data_size, shingle_size); } else { res[i] = wordShingleHash(one_data, data_size, shingle_size); } } } }; // MinHash: String -> Tuple(UInt64, UInt64) // for each string, we extract ngram or word shingle, // for each ngram or word shingle, calculate a hash value, // then we take the K minimum hash values to calculate a hashsum, // and take the K maximum hash values to calculate another hashsum, // return this two hashsum: Tuple(hashsum1, hashsum2) // // UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) // Ngram: means ngram(true) or words shingles(false) // CaseInsensitive: means should we consider about letter case or not template struct MinHashImpl { static constexpr size_t min_word_size = 4; template struct Heap { void update(UInt64 hash, BytesRef ref, size_t limit) { if (values.contains(hash)) return; values[hash] = ref; if (values.size() > limit) values.erase(values.begin()); } UInt64 getHash() { if (values.empty()) return 0; UInt64 res = 0; for (auto it = values.begin(); it != values.end(); ++it) res = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(res, it->first)); return res; } void fill(ColumnTuple & strings) { auto it = values.begin(); for (size_t i = 0; i < strings.tupleSize(); ++i) { auto & col_string = static_cast(strings.getColumn(i)); if (it != values.end()) { col_string.insertData(reinterpret_cast(it->second.data), it->second.size); ++it; } else col_string.insertDefault(); } } std::map values; }; using MaxHeap = Heap>; using MinHeap = Heap>; static ALWAYS_INLINE inline void ngramHashASCII( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, size_t size, size_t shingle_size, size_t heap_size) { if (size < shingle_size) { UInt64 hash_value = Hash::shingleHash(-1ULL, data, size); min_heap.update(hash_value, BytesRef{data, size}, heap_size); max_heap.update(hash_value, BytesRef{data, size}, heap_size); return; } const UInt8 * end = data + size; for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos) { UInt64 hash_value = Hash::shingleHash(-1ULL, pos, shingle_size); // insert the new hash value into array used to store K minimum value // and K maximum value min_heap.update(hash_value, BytesRef{pos, shingle_size}, heap_size); max_heap.update(hash_value, BytesRef{pos, shingle_size}, heap_size); } } static ALWAYS_INLINE inline void ngramHashUTF8( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, size_t size, size_t shingle_size, size_t heap_size) { const UInt8 * start = data; const UInt8 * end = data + size; const UInt8 * word_start = start; const UInt8 * word_end = start; for (size_t i = 0; i < shingle_size; ++i) { if (word_end >= end) { auto hash_value = Hash::shingleHash(-1ULL, data, size); min_heap.update(hash_value, BytesRef{data, size}, heap_size); max_heap.update(hash_value, BytesRef{data, size}, heap_size); return; } ExtractStringImpl::readOneUTF8Code(word_end, end); } while (word_end < end) { ExtractStringImpl::readOneUTF8Code(word_start, word_end); ExtractStringImpl::readOneUTF8Code(word_end, end); size_t length = word_end - word_start; UInt64 hash_value = Hash::shingleHash(-1ULL, word_start, length); min_heap.update(hash_value, BytesRef{word_start, length}, heap_size); max_heap.update(hash_value, BytesRef{word_start, length}, heap_size); } } // MinHash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) // for each word shingle, we calculate a hash value, but in fact, we just maintain the // K minimum and K maximum hash value static ALWAYS_INLINE inline void wordShingleHash( MinHeap & min_heap, MaxHeap & max_heap, const UInt8 * data, size_t size, size_t shingle_size, size_t heap_size) { const UInt8 * start = data; const UInt8 * end = data + size; // An array to store N words. std::vector words; words.reserve(shingle_size); // get first word shingle while (start < end && words.size() < shingle_size) { const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end); size_t length = start - word_start; if (length >= min_word_size) words.emplace_back(BytesRef{word_start, length}); } if (words.empty()) return; UInt64 hash_value = Hash::shingleHash(words); { const UInt8 * shingle_start = words.front().data; const UInt8 * shingle_end = words.back().data + words.back().size; BytesRef ref{shingle_start, static_cast(shingle_end - shingle_start)}; min_heap.update(hash_value, ref, heap_size); max_heap.update(hash_value, ref, heap_size); } size_t offset = 0; while (start < end) { const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end); size_t length = start - word_start; if (length < min_word_size) continue; words[offset] = BytesRef{word_start, length}; const UInt8 * shingle_end = words[offset].data + length; ++offset; if (offset >= shingle_size) offset = 0; const UInt8 * shingle_start = words[offset].data; hash_value = Hash::shingleHash(words, offset); BytesRef ref{shingle_start, static_cast(shingle_end - shingle_start)}; min_heap.update(hash_value, ref, heap_size); max_heap.update(hash_value, ref, heap_size); } } static void apply( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, size_t shingle_size, size_t heap_size, PaddedPODArray * res1, PaddedPODArray * res2, ColumnTuple * res1_strings, ColumnTuple * res2_strings) { MinHeap min_heap; MaxHeap max_heap; for (size_t i = 0; i < offsets.size(); ++i) { const UInt8 * one_data = &data[offsets[i - 1]]; const size_t data_size = offsets[i] - offsets[i - 1] - 1; min_heap.values.clear(); max_heap.values.clear(); if constexpr (Ngram) { if constexpr (!UTF8) ngramHashASCII(min_heap, max_heap, one_data, data_size, shingle_size, heap_size); else ngramHashUTF8(min_heap, max_heap, one_data, data_size, shingle_size, heap_size); } else { wordShingleHash(min_heap, max_heap, one_data, data_size, shingle_size, heap_size); } if (res1) (*res1)[i] = min_heap.getHash(); if (res2) (*res2)[i] = max_heap.getHash(); if (res1_strings) min_heap.fill(*res1_strings); if (res2_strings) max_heap.fill(*res2_strings); } } }; struct NameNgramSimHash { static constexpr auto name = "ngramSimHash"; }; struct NameNgramSimHashCaseInsensitive { static constexpr auto name = "ngramSimHashCaseInsensitive"; }; struct NameNgramSimHashUTF8 { static constexpr auto name = "ngramSimHashUTF8"; }; struct NameNgramSimHashCaseInsensitiveUTF8 { static constexpr auto name = "ngramSimHashCaseInsensitiveUTF8"; }; struct NameWordShingleSimHash { static constexpr auto name = "wordShingleSimHash"; }; struct NameWordShingleSimHashCaseInsensitive { static constexpr auto name = "wordShingleSimHashCaseInsensitive"; }; struct NameWordShingleSimHashUTF8 { static constexpr auto name = "wordShingleSimHashUTF8"; }; struct NameWordShingleSimHashCaseInsensitiveUTF8 { static constexpr auto name = "wordShingleSimHashCaseInsensitiveUTF8"; }; struct NameNgramMinHash { static constexpr auto name = "ngramMinHash"; }; struct NameNgramMinHashCaseInsensitive { static constexpr auto name = "ngramMinHashCaseInsensitive"; }; struct NameNgramMinHashUTF8 { static constexpr auto name = "ngramMinHashUTF8"; }; struct NameNgramMinHashCaseInsensitiveUTF8 { static constexpr auto name = "ngramMinHashCaseInsensitiveUTF8"; }; struct NameWordShingleMinHash { static constexpr auto name = "wordShingleMinHash"; }; struct NameWordShingleMinHashCaseInsensitive { static constexpr auto name = "wordShingleMinHashCaseInsensitive"; }; struct NameWordShingleMinHashUTF8 { static constexpr auto name = "wordShingleMinHashUTF8"; }; struct NameWordShingleMinHashCaseInsensitiveUTF8 { static constexpr auto name = "wordShingleMinHashCaseInsensitiveUTF8"; }; struct NameNgramMinHashArg { static constexpr auto name = "ngramMinHashArg"; }; struct NameNgramMinHashArgCaseInsensitive { static constexpr auto name = "ngramMinHashArgCaseInsensitive"; }; struct NameNgramMinHashArgUTF8 { static constexpr auto name = "ngramMinHashArgUTF8"; }; struct NameNgramMinHashArgCaseInsensitiveUTF8 { static constexpr auto name = "ngramMinHashArgCaseInsensitiveUTF8"; }; struct NameWordShingleMinHashArg { static constexpr auto name = "wordShingleMinHashArg"; }; struct NameWordShingleMinHashArgCaseInsensitive { static constexpr auto name = "wordShingleMinHashArgCaseInsensitive"; }; struct NameWordShingleMinHashArgUTF8 { static constexpr auto name = "wordShingleMinHashArgUTF8"; }; struct NameWordShingleMinHashArgCaseInsensitiveUTF8 { static constexpr auto name = "wordShingleMinHashArgCaseInsensitiveUTF8"; }; // SimHash using FunctionNgramSimHash = FunctionsStringHash, NameNgramSimHash, true>; using FunctionNgramSimHashCaseInsensitive = FunctionsStringHash, NameNgramSimHashCaseInsensitive, true>; using FunctionNgramSimHashUTF8 = FunctionsStringHash, NameNgramSimHashUTF8, true>; using FunctionNgramSimHashCaseInsensitiveUTF8 = FunctionsStringHash, NameNgramSimHashCaseInsensitiveUTF8, true>; using FunctionWordShingleSimHash = FunctionsStringHash, NameWordShingleSimHash, true>; using FunctionWordShingleSimHashCaseInsensitive = FunctionsStringHash, NameWordShingleSimHashCaseInsensitive, true>; using FunctionWordShingleSimHashUTF8 = FunctionsStringHash, NameWordShingleSimHashUTF8, true>; using FunctionWordShingleSimHashCaseInsensitiveUTF8 = FunctionsStringHash, NameWordShingleSimHashCaseInsensitiveUTF8, true>; // MinHash using FunctionNgramMinHash = FunctionsStringHash, NameNgramMinHash, false>; using FunctionNgramMinHashCaseInsensitive = FunctionsStringHash, NameNgramMinHashCaseInsensitive, false>; using FunctionNgramMinHashUTF8 = FunctionsStringHash, NameNgramMinHashUTF8, false>; using FunctionNgramMinHashCaseInsensitiveUTF8 = FunctionsStringHash, NameNgramMinHashCaseInsensitiveUTF8, false>; using FunctionWordShingleMinHash = FunctionsStringHash, NameWordShingleMinHash, false>; using FunctionWordShingleMinHashCaseInsensitive = FunctionsStringHash, NameWordShingleMinHashCaseInsensitive, false>; using FunctionWordShingleMinHashUTF8 = FunctionsStringHash, NameWordShingleMinHashUTF8, false>; using FunctionWordShingleMinHashCaseInsensitiveUTF8 = FunctionsStringHash, NameWordShingleMinHashCaseInsensitiveUTF8, false>; // MinHasArg using FunctionNgramMinHashArg = FunctionsStringHash, NameNgramMinHashArg, false, true>; using FunctionNgramMinHashArgCaseInsensitive = FunctionsStringHash, NameNgramMinHashArgCaseInsensitive, false, true>; using FunctionNgramMinHashArgUTF8 = FunctionsStringHash, NameNgramMinHashArgUTF8, false, true>; using FunctionNgramMinHashArgCaseInsensitiveUTF8 = FunctionsStringHash, NameNgramMinHashArgCaseInsensitiveUTF8, false, true>; using FunctionWordShingleMinHashArg = FunctionsStringHash, NameWordShingleMinHashArg, false, true>; using FunctionWordShingleMinHashArgCaseInsensitive = FunctionsStringHash, NameWordShingleMinHashArgCaseInsensitive, false, true>; using FunctionWordShingleMinHashArgUTF8 = FunctionsStringHash, NameWordShingleMinHashArgUTF8, false, true>; using FunctionWordShingleMinHashArgCaseInsensitiveUTF8 = FunctionsStringHash, NameWordShingleMinHashArgCaseInsensitiveUTF8, false, true>; REGISTER_FUNCTION(StringHash) { factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); factory.registerFunction(); } }