2019-11-06 10:34:13 +00:00
|
|
|
#include <Functions/FunctionsStringHash.h>
|
|
|
|
|
|
|
|
#include <Functions/ExtractString.h>
|
|
|
|
#include <Functions/FunctionFactory.h>
|
|
|
|
#include <Functions/FunctionsHashing.h>
|
2020-06-10 15:02:58 +00:00
|
|
|
#include <Common/PODArray.h>
|
2019-11-06 10:34:13 +00:00
|
|
|
|
|
|
|
#include <Core/Defines.h>
|
|
|
|
|
2020-06-10 15:02:58 +00:00
|
|
|
#include <functional>
|
2019-11-06 10:34:13 +00:00
|
|
|
#include <tuple>
|
2020-06-10 15:02:58 +00:00
|
|
|
#include <vector>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/unaligned.h>
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
#include <city.h>
|
|
|
|
|
2023-01-11 04:56:16 +00:00
|
|
|
#if (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
#include "vec_crc32.h"
|
|
|
|
#endif
|
|
|
|
|
2019-11-06 10:34:13 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2020-12-21 11:39:17 +00:00
|
|
|
|
|
|
|
/// Error codes referenced in this file. Only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
}
|
|
|
|
|
2020-12-25 16:04:55 +00:00
|
|
|
struct BytesRef
|
|
|
|
{
|
|
|
|
const UInt8 * data;
|
|
|
|
size_t size;
|
|
|
|
};
|
|
|
|
|
2019-11-06 10:34:13 +00:00
|
|
|
struct Hash
|
|
|
|
{
|
2020-12-22 15:29:32 +00:00
|
|
|
static UInt64 crc32u64(UInt64 crc [[maybe_unused]], UInt64 val [[maybe_unused]])
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-21 11:39:17 +00:00
|
|
|
#ifdef __SSE4_2__
|
2020-12-22 15:29:32 +00:00
|
|
|
return _mm_crc32_u64(crc, val);
|
|
|
|
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
|
2022-10-07 10:46:45 +00:00
|
|
|
return __crc32cd(static_cast<UInt32>(crc), val);
|
2023-01-11 04:56:16 +00:00
|
|
|
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
|
2023-01-12 01:00:10 +00:00
|
|
|
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
|
|
|
|
return s390x_crc32(crc, val);
|
2020-12-21 11:39:17 +00:00
|
|
|
#else
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support");
|
2020-12-21 11:39:17 +00:00
|
|
|
#endif
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
static UInt64 crc32u32(UInt32 crc [[maybe_unused]], UInt32 val [[maybe_unused]])
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
|
|
|
#ifdef __SSE4_2__
|
2020-12-22 15:29:32 +00:00
|
|
|
return _mm_crc32_u32(crc, val);
|
|
|
|
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
|
|
|
|
return __crc32cw(crc, val);
|
2023-01-11 04:56:16 +00:00
|
|
|
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
|
2023-01-12 01:00:10 +00:00
|
|
|
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
|
|
|
|
return s390x_crc32_u32(crc, val);
|
2019-11-06 10:34:13 +00:00
|
|
|
#else
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support");
|
2019-11-06 10:34:13 +00:00
|
|
|
#endif
|
2020-12-22 15:29:32 +00:00
|
|
|
}
|
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
static UInt64 crc32u16(UInt32 crc [[maybe_unused]], UInt16 val [[maybe_unused]])
|
2020-12-25 11:22:48 +00:00
|
|
|
{
|
|
|
|
#ifdef __SSE4_2__
|
|
|
|
return _mm_crc32_u16(crc, val);
|
|
|
|
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
|
|
|
|
return __crc32ch(crc, val);
|
2023-01-11 04:56:16 +00:00
|
|
|
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
|
2023-01-12 01:00:10 +00:00
|
|
|
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
|
|
|
|
return s390x_crc32_u16(crc, val);
|
2020-12-25 11:22:48 +00:00
|
|
|
#else
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support");
|
2020-12-25 11:22:48 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
static UInt64 crc32u8(UInt32 crc [[maybe_unused]], UInt8 val [[maybe_unused]])
|
2020-12-22 15:29:32 +00:00
|
|
|
{
|
|
|
|
#ifdef __SSE4_2__
|
|
|
|
return _mm_crc32_u8(crc, val);
|
|
|
|
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
|
|
|
|
return __crc32cb(crc, val);
|
2023-01-11 04:56:16 +00:00
|
|
|
#elif (defined(__PPC64__) || defined(__powerpc64__)) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
return crc32_ppc(crc, reinterpret_cast<const unsigned char *>(&val), sizeof(val));
|
2023-01-12 01:00:10 +00:00
|
|
|
#elif defined(__s390x__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
|
|
|
|
return s390x_crc32_u8(crc, val);
|
2020-12-22 15:29:32 +00:00
|
|
|
#else
|
2023-01-23 21:13:58 +00:00
|
|
|
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "String hash is not implemented without sse4.2 support");
|
2020-12-22 15:29:32 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
template <bool CaseInsensitive>
|
|
|
|
static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size)
|
2020-12-22 15:29:32 +00:00
|
|
|
{
|
2020-12-25 11:22:48 +00:00
|
|
|
if (size & 1)
|
|
|
|
{
|
|
|
|
UInt8 x = *start;
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
x |= 0x20u; /// see toLowerIfAlphaASCII from StringUtils.h
|
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
crc = crc32u8(static_cast<UInt32>(crc), x);
|
2020-12-25 11:22:48 +00:00
|
|
|
--size;
|
|
|
|
++start;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (size & 2)
|
|
|
|
{
|
|
|
|
UInt16 x = unalignedLoad<UInt16>(start);
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
x |= 0x2020u;
|
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
crc = crc32u16(static_cast<UInt32>(crc), x);
|
2020-12-25 11:22:48 +00:00
|
|
|
size -= 2;
|
|
|
|
start += 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (size & 4)
|
|
|
|
{
|
|
|
|
UInt32 x = unalignedLoad<UInt32>(start);
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
x |= 0x20202020u;
|
|
|
|
|
2022-10-07 10:46:45 +00:00
|
|
|
crc = crc32u32(static_cast<UInt32>(crc), x);
|
2020-12-25 11:22:48 +00:00
|
|
|
size -= 4;
|
|
|
|
start += 4;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (size)
|
|
|
|
{
|
|
|
|
UInt64 x = unalignedLoad<UInt64>(start);
|
|
|
|
|
|
|
|
if constexpr (CaseInsensitive)
|
|
|
|
x |= 0x2020202020202020u;
|
|
|
|
|
|
|
|
crc = crc32u64(crc, x);
|
|
|
|
size -= 8;
|
|
|
|
start += 8;
|
|
|
|
}
|
|
|
|
|
|
|
|
return crc;
|
2020-12-22 15:29:32 +00:00
|
|
|
}
|
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
template <bool CaseInsensitive>
|
2020-12-25 16:04:55 +00:00
|
|
|
static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector<BytesRef> & shingle, size_t offset = 0)
|
2020-12-22 15:29:32 +00:00
|
|
|
{
|
|
|
|
UInt64 crc = -1ULL;
|
2020-12-25 11:22:48 +00:00
|
|
|
|
|
|
|
for (size_t i = offset; i < shingle.size(); ++i)
|
|
|
|
crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < offset; ++i)
|
|
|
|
crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
|
2020-12-25 16:49:55 +00:00
|
|
|
|
|
|
|
return crc;
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
// SimHash String -> UInt64
|
2020-06-10 15:02:58 +00:00
|
|
|
// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true)
|
|
|
|
// Ngram: means ngram(true) or words shingles(false)
|
|
|
|
// CaseInsensitive: means should we consider about letter case or not
|
2020-12-25 14:18:41 +00:00
|
|
|
template <bool UTF8, bool Ngram, bool CaseInsensitive>
|
2020-12-24 15:49:05 +00:00
|
|
|
struct SimHashImpl
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 17:58:44 +00:00
|
|
|
static constexpr size_t min_word_size = 4;
|
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
/// Update fingerprint according to hash_value bits.
|
|
|
|
static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < 64; ++i)
|
|
|
|
finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return a 64 bit value according to finger_vec.
|
|
|
|
static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec)
|
|
|
|
{
|
|
|
|
UInt64 res = 0;
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
for (size_t i = 0; i < 64; ++i)
|
|
|
|
if (finger_vec[i] > 0)
|
|
|
|
res |= (1ULL << i);
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
// SimHash ngram calculate function: String -> UInt64
|
2019-11-06 10:34:13 +00:00
|
|
|
// this function extracting ngram from input string, and maintain a 64-dimensions vector
|
|
|
|
// for each ngram, calculate a 64 bit hash value, and update the vector according the hash value
|
|
|
|
// finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0
|
2020-12-25 11:22:48 +00:00
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
static ALWAYS_INLINE inline UInt64 ngramHashASCII(const UInt8 * data, size_t size, size_t shingle_size)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 11:22:48 +00:00
|
|
|
if (size < shingle_size)
|
|
|
|
return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
|
|
|
|
|
2019-11-06 10:34:13 +00:00
|
|
|
Int64 finger_vec[64] = {};
|
2020-12-25 11:22:48 +00:00
|
|
|
const UInt8 * end = data + size;
|
|
|
|
|
|
|
|
for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos)
|
|
|
|
{
|
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, pos, shingle_size);
|
|
|
|
updateFingerVector(finger_vec, hash_value);
|
|
|
|
}
|
|
|
|
|
|
|
|
return getSimHash(finger_vec);
|
|
|
|
}
|
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
static ALWAYS_INLINE inline UInt64 ngramHashUTF8(const UInt8 * data, size_t size, size_t shingle_size)
|
2020-12-25 11:22:48 +00:00
|
|
|
{
|
|
|
|
const UInt8 * start = data;
|
|
|
|
const UInt8 * end = data + size;
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
const UInt8 * word_start = start;
|
|
|
|
const UInt8 * word_end = start;
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
for (size_t i = 0; i < shingle_size; ++i)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 11:22:48 +00:00
|
|
|
if (word_end >= end)
|
|
|
|
return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
|
|
|
|
|
|
|
|
ExtractStringImpl::readOneUTF8Code(word_end, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
Int64 finger_vec[64] = {};
|
|
|
|
|
|
|
|
while (word_end < end)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 11:22:48 +00:00
|
|
|
ExtractStringImpl::readOneUTF8Code(word_start, word_end);
|
|
|
|
ExtractStringImpl::readOneUTF8Code(word_end, end);
|
|
|
|
|
|
|
|
size_t length = word_end - word_start;
|
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, word_start, length);
|
|
|
|
updateFingerVector(finger_vec, hash_value);
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
2020-12-25 11:22:48 +00:00
|
|
|
|
|
|
|
return getSimHash(finger_vec);
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
// SimHash word shingle calculate function: String -> UInt64
|
2019-11-06 10:34:13 +00:00
|
|
|
// this function extracting n word shingle from input string, and maintain a 64-dimensions vector as well
|
|
|
|
// for each word shingle, calculate a 64 bit hash value, and update the vector according the hash value
|
|
|
|
// finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0
|
|
|
|
//
|
|
|
|
// word shingle hash value calculate:
|
|
|
|
// 1. at the first, extracts N word shingles and calculate N hash values, store into an array, use this N hash values
|
|
|
|
// to calculate the first word shingle hash value
|
2020-12-24 10:48:10 +00:00
|
|
|
// 2. next, we extract one word each time, and calculate a new hash value of the new word,then use the latest N hash
|
|
|
|
// values to calculate the next word shingle hash value
|
2020-12-25 11:22:48 +00:00
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
static ALWAYS_INLINE inline UInt64 wordShingleHash(const UInt8 * data, size_t size, size_t shingle_size)
|
2020-12-25 11:22:48 +00:00
|
|
|
{
|
|
|
|
const UInt8 * start = data;
|
|
|
|
const UInt8 * end = data + size;
|
|
|
|
|
|
|
|
// A 64 bit vector initialized to zero.
|
|
|
|
Int64 finger_vec[64] = {};
|
|
|
|
// An array to store N words.
|
2020-12-25 16:04:55 +00:00
|
|
|
std::vector<BytesRef> words;
|
2020-12-25 11:22:48 +00:00
|
|
|
words.reserve(shingle_size);
|
|
|
|
|
|
|
|
// get first word shingle
|
|
|
|
while (start < end && words.size() < shingle_size)
|
|
|
|
{
|
2020-12-29 10:16:22 +00:00
|
|
|
const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end);
|
2020-12-25 11:22:48 +00:00
|
|
|
size_t length = start - word_start;
|
|
|
|
|
2020-12-25 17:58:44 +00:00
|
|
|
if (length >= min_word_size)
|
2020-12-25 16:11:05 +00:00
|
|
|
words.emplace_back(BytesRef{word_start, length});
|
2020-12-25 11:22:48 +00:00
|
|
|
}
|
|
|
|
|
2020-12-27 09:54:59 +00:00
|
|
|
if (words.empty())
|
|
|
|
return 0;
|
|
|
|
|
2020-12-25 11:22:48 +00:00
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(words);
|
|
|
|
updateFingerVector(finger_vec, hash_value);
|
|
|
|
|
|
|
|
size_t offset = 0;
|
|
|
|
while (start < end)
|
|
|
|
{
|
2020-12-29 10:16:22 +00:00
|
|
|
const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end);
|
2020-12-25 11:22:48 +00:00
|
|
|
size_t length = start - word_start;
|
|
|
|
|
2020-12-25 17:58:44 +00:00
|
|
|
if (length < min_word_size)
|
2020-12-25 11:22:48 +00:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// we need to store the new word hash value to the oldest location.
|
2023-07-25 04:10:04 +00:00
|
|
|
// for example, N = 5, array |a0|a1|a2|a3|a4|, now, a0 is the oldest location,
|
|
|
|
// so we need to store new word hash into location of a0, then this array become
|
2020-12-25 11:22:48 +00:00
|
|
|
// |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new
|
|
|
|
// word hash value into location of a1, then array become |a5|a6|a2|a3|a4|
|
2020-12-25 16:04:55 +00:00
|
|
|
words[offset] = BytesRef{word_start, length};
|
2020-12-25 11:22:48 +00:00
|
|
|
++offset;
|
|
|
|
if (offset >= shingle_size)
|
|
|
|
offset = 0;
|
|
|
|
|
|
|
|
// according to the word hash storation way, in order to not lose the word shingle's
|
|
|
|
// sequence information, when calculation word shingle hash value, we need provide the offset
|
|
|
|
// information, which is the offset of the first word's hash value of the word shingle
|
|
|
|
hash_value = Hash::shingleHash<CaseInsensitive>(words, offset);
|
|
|
|
updateFingerVector(finger_vec, hash_value);
|
|
|
|
}
|
|
|
|
|
|
|
|
return getSimHash(finger_vec);
|
|
|
|
}
|
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, size_t shingle_size, PaddedPODArray<UInt64> & res)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
const UInt8 * one_data = &data[offsets[i - 1]];
|
2019-11-06 10:34:13 +00:00
|
|
|
const size_t data_size = offsets[i] - offsets[i - 1] - 1;
|
2020-12-24 15:49:05 +00:00
|
|
|
|
|
|
|
if constexpr (Ngram)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-24 15:49:05 +00:00
|
|
|
if constexpr (!UTF8)
|
2020-12-25 14:18:41 +00:00
|
|
|
res[i] = ngramHashASCII(one_data, data_size, shingle_size);
|
2019-11-06 10:34:13 +00:00
|
|
|
else
|
2020-12-25 14:18:41 +00:00
|
|
|
res[i] = ngramHashUTF8(one_data, data_size, shingle_size);
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
2020-06-10 15:02:58 +00:00
|
|
|
else
|
2020-12-24 15:49:05 +00:00
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
res[i] = wordShingleHash(one_data, data_size, shingle_size);
|
2020-12-24 15:49:05 +00:00
|
|
|
}
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
// MinHash: String -> Tuple(UInt64, UInt64)
|
2019-11-06 10:35:55 +00:00
|
|
|
// for each string, we extract ngram or word shingle,
|
|
|
|
// for each ngram or word shingle, calculate a hash value,
|
|
|
|
// then we take the K minimum hash values to calculate a hashsum,
|
|
|
|
// and take the K maximum hash values to calculate another hashsum,
|
|
|
|
// return this two hashsum: Tuple(hashsum1, hashsum2)
|
2020-06-10 15:02:58 +00:00
|
|
|
//
|
|
|
|
// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true)
|
|
|
|
// Ngram: means ngram(true) or words shingles(false)
|
|
|
|
// CaseInsensitive: means should we consider about letter case or not
|
2020-12-25 14:18:41 +00:00
|
|
|
template <bool UTF8, bool Ngram, bool CaseInsensitive>
|
2020-12-24 15:49:05 +00:00
|
|
|
struct MinHashImpl
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 18:03:08 +00:00
|
|
|
static constexpr size_t min_word_size = 4;
|
2020-12-25 17:58:44 +00:00
|
|
|
|
2020-12-29 10:16:22 +00:00
|
|
|
template <typename Comp>
|
2020-12-25 14:18:41 +00:00
|
|
|
struct Heap
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 16:04:55 +00:00
|
|
|
void update(UInt64 hash, BytesRef ref, size_t limit)
|
2020-12-25 14:18:41 +00:00
|
|
|
{
|
2022-04-18 10:18:43 +00:00
|
|
|
if (values.contains(hash))
|
2020-12-25 14:18:41 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
values[hash] = ref;
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
if (values.size() > limit)
|
|
|
|
values.erase(values.begin());
|
|
|
|
}
|
|
|
|
|
|
|
|
UInt64 getHash()
|
|
|
|
{
|
|
|
|
if (values.empty())
|
|
|
|
return 0;
|
|
|
|
|
2020-12-25 16:11:05 +00:00
|
|
|
UInt64 res = 0;
|
|
|
|
for (auto it = values.begin(); it != values.end(); ++it)
|
2020-12-25 14:18:41 +00:00
|
|
|
res = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(res, it->first));
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
void fill(ColumnTuple & strings)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
auto it = values.begin();
|
2020-12-25 17:30:44 +00:00
|
|
|
for (size_t i = 0; i < strings.tupleSize(); ++i)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
auto & col_string = static_cast<ColumnString &>(strings.getColumn(i));
|
|
|
|
if (it != values.end())
|
|
|
|
{
|
2020-12-25 16:56:51 +00:00
|
|
|
col_string.insertData(reinterpret_cast<const char *>(it->second.data), it->second.size);
|
2020-12-25 14:18:41 +00:00
|
|
|
++it;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
col_string.insertDefault();
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
2020-12-25 14:18:41 +00:00
|
|
|
}
|
|
|
|
|
2020-12-25 16:04:55 +00:00
|
|
|
std::map<UInt64, BytesRef, Comp> values;
|
2020-12-25 14:18:41 +00:00
|
|
|
};
|
|
|
|
|
2022-04-30 18:11:06 +00:00
|
|
|
using MaxHeap = Heap<std::less<>>;
|
|
|
|
using MinHeap = Heap<std::greater<>>;
|
2020-12-25 14:18:41 +00:00
|
|
|
|
2020-12-25 16:55:15 +00:00
|
|
|
static ALWAYS_INLINE inline void ngramHashASCII(
|
2020-12-25 14:18:41 +00:00
|
|
|
MinHeap & min_heap,
|
|
|
|
MaxHeap & max_heap,
|
|
|
|
const UInt8 * data,
|
|
|
|
size_t size,
|
|
|
|
size_t shingle_size,
|
|
|
|
size_t heap_size)
|
|
|
|
{
|
|
|
|
if (size < shingle_size)
|
2020-12-25 16:55:15 +00:00
|
|
|
{
|
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
|
|
|
|
min_heap.update(hash_value, BytesRef{data, size}, heap_size);
|
|
|
|
max_heap.update(hash_value, BytesRef{data, size}, heap_size);
|
|
|
|
return;
|
|
|
|
}
|
2020-12-25 14:18:41 +00:00
|
|
|
|
|
|
|
const UInt8 * end = data + size;
|
|
|
|
|
|
|
|
for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos)
|
|
|
|
{
|
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, pos, shingle_size);
|
|
|
|
|
|
|
|
// insert the new hash value into array used to store K minimum value
|
|
|
|
// and K maximum value
|
2020-12-25 16:04:55 +00:00
|
|
|
min_heap.update(hash_value, BytesRef{pos, shingle_size}, heap_size);
|
|
|
|
max_heap.update(hash_value, BytesRef{pos, shingle_size}, heap_size);
|
2020-12-25 14:18:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-12-25 16:55:15 +00:00
|
|
|
static ALWAYS_INLINE inline void ngramHashUTF8(
|
2020-12-25 14:18:41 +00:00
|
|
|
MinHeap & min_heap,
|
|
|
|
MaxHeap & max_heap,
|
|
|
|
const UInt8 * data,
|
|
|
|
size_t size,
|
|
|
|
size_t shingle_size,
|
|
|
|
size_t heap_size)
|
|
|
|
{
|
|
|
|
const UInt8 * start = data;
|
|
|
|
const UInt8 * end = data + size;
|
|
|
|
|
|
|
|
const UInt8 * word_start = start;
|
|
|
|
const UInt8 * word_end = start;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < shingle_size; ++i)
|
|
|
|
{
|
|
|
|
if (word_end >= end)
|
2020-12-25 16:55:15 +00:00
|
|
|
{
|
|
|
|
auto hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
|
|
|
|
min_heap.update(hash_value, BytesRef{data, size}, heap_size);
|
|
|
|
max_heap.update(hash_value, BytesRef{data, size}, heap_size);
|
|
|
|
return;
|
|
|
|
}
|
2020-12-25 14:18:41 +00:00
|
|
|
|
|
|
|
ExtractStringImpl::readOneUTF8Code(word_end, end);
|
|
|
|
}
|
|
|
|
|
|
|
|
while (word_end < end)
|
|
|
|
{
|
|
|
|
ExtractStringImpl::readOneUTF8Code(word_start, word_end);
|
|
|
|
ExtractStringImpl::readOneUTF8Code(word_end, end);
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
size_t length = word_end - word_start;
|
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, word_start, length);
|
|
|
|
|
2020-12-25 16:04:55 +00:00
|
|
|
min_heap.update(hash_value, BytesRef{word_start, length}, heap_size);
|
|
|
|
max_heap.update(hash_value, BytesRef{word_start, length}, heap_size);
|
2020-12-25 14:18:41 +00:00
|
|
|
}
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
// MinHash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64)
|
2019-11-06 10:35:55 +00:00
|
|
|
// for each word shingle, we calculate a hash value, but in fact, we just maintain the
|
|
|
|
// K minimum and K maximum hash value
|
2020-12-25 16:55:15 +00:00
|
|
|
static ALWAYS_INLINE inline void wordShingleHash(
|
2020-12-25 14:18:41 +00:00
|
|
|
MinHeap & min_heap,
|
|
|
|
MaxHeap & max_heap,
|
|
|
|
const UInt8 * data,
|
2020-05-22 13:23:49 +00:00
|
|
|
size_t size,
|
2020-12-25 14:18:41 +00:00
|
|
|
size_t shingle_size,
|
|
|
|
size_t heap_size)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
const UInt8 * start = data;
|
|
|
|
const UInt8 * end = data + size;
|
|
|
|
|
|
|
|
// An array to store N words.
|
2020-12-25 16:04:55 +00:00
|
|
|
std::vector<BytesRef> words;
|
2020-12-25 14:18:41 +00:00
|
|
|
words.reserve(shingle_size);
|
|
|
|
|
|
|
|
// get first word shingle
|
|
|
|
while (start < end && words.size() < shingle_size)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-29 10:16:22 +00:00
|
|
|
const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end);
|
2020-12-25 14:18:41 +00:00
|
|
|
size_t length = start - word_start;
|
|
|
|
|
2020-12-25 17:58:44 +00:00
|
|
|
if (length >= min_word_size)
|
2020-12-25 17:00:42 +00:00
|
|
|
words.emplace_back(BytesRef{word_start, length});
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
|
2020-12-27 09:54:59 +00:00
|
|
|
if (words.empty())
|
|
|
|
return;
|
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(words);
|
|
|
|
{
|
2020-12-25 16:04:55 +00:00
|
|
|
const UInt8 * shingle_start = words.front().data;
|
|
|
|
const UInt8 * shingle_end = words.back().data + words.back().size;
|
|
|
|
BytesRef ref{shingle_start, static_cast<size_t>(shingle_end - shingle_start)};
|
2020-12-25 14:18:41 +00:00
|
|
|
min_heap.update(hash_value, ref, heap_size);
|
|
|
|
max_heap.update(hash_value, ref, heap_size);
|
|
|
|
}
|
2019-11-06 10:34:13 +00:00
|
|
|
|
|
|
|
size_t offset = 0;
|
2020-12-25 14:18:41 +00:00
|
|
|
while (start < end)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-29 10:16:22 +00:00
|
|
|
const UInt8 * word_start = ExtractStringImpl::readOneWord(start, end);
|
2020-12-25 14:18:41 +00:00
|
|
|
|
|
|
|
size_t length = start - word_start;
|
|
|
|
|
2020-12-25 17:58:44 +00:00
|
|
|
if (length < min_word_size)
|
2020-12-25 14:18:41 +00:00
|
|
|
continue;
|
|
|
|
|
2020-12-25 16:04:55 +00:00
|
|
|
words[offset] = BytesRef{word_start, length};
|
|
|
|
const UInt8 * shingle_end = words[offset].data + length;
|
2020-12-25 14:18:41 +00:00
|
|
|
|
|
|
|
++offset;
|
|
|
|
if (offset >= shingle_size)
|
|
|
|
offset = 0;
|
|
|
|
|
2020-12-25 16:04:55 +00:00
|
|
|
const UInt8 * shingle_start = words[offset].data;
|
2020-12-25 14:18:41 +00:00
|
|
|
|
|
|
|
hash_value = Hash::shingleHash<CaseInsensitive>(words, offset);
|
2020-12-25 16:04:55 +00:00
|
|
|
BytesRef ref{shingle_start, static_cast<size_t>(shingle_end - shingle_start)};
|
2020-12-25 14:18:41 +00:00
|
|
|
min_heap.update(hash_value, ref, heap_size);
|
|
|
|
max_heap.update(hash_value, ref, heap_size);
|
|
|
|
}
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
|
2020-06-10 15:02:58 +00:00
|
|
|
static void apply(
|
2019-11-06 10:34:13 +00:00
|
|
|
const ColumnString::Chars & data,
|
|
|
|
const ColumnString::Offsets & offsets,
|
2020-12-25 14:18:41 +00:00
|
|
|
size_t shingle_size,
|
|
|
|
size_t heap_size,
|
|
|
|
PaddedPODArray<UInt64> * res1,
|
|
|
|
PaddedPODArray<UInt64> * res2,
|
|
|
|
ColumnTuple * res1_strings,
|
|
|
|
ColumnTuple * res2_strings)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
MinHeap min_heap;
|
|
|
|
MaxHeap max_heap;
|
|
|
|
|
2019-11-06 10:34:13 +00:00
|
|
|
for (size_t i = 0; i < offsets.size(); ++i)
|
|
|
|
{
|
2020-12-25 16:11:05 +00:00
|
|
|
const UInt8 * one_data = &data[offsets[i - 1]];
|
2019-11-06 10:34:13 +00:00
|
|
|
const size_t data_size = offsets[i] - offsets[i - 1] - 1;
|
2020-12-24 15:49:05 +00:00
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
min_heap.values.clear();
|
|
|
|
max_heap.values.clear();
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
if constexpr (Ngram)
|
2019-11-06 10:34:13 +00:00
|
|
|
{
|
2020-12-24 15:49:05 +00:00
|
|
|
if constexpr (!UTF8)
|
2020-12-25 14:18:41 +00:00
|
|
|
ngramHashASCII(min_heap, max_heap, one_data, data_size, shingle_size, heap_size);
|
2019-11-06 10:34:13 +00:00
|
|
|
else
|
2020-12-25 14:18:41 +00:00
|
|
|
ngramHashUTF8(min_heap, max_heap, one_data, data_size, shingle_size, heap_size);
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
2020-06-10 15:02:58 +00:00
|
|
|
else
|
2020-12-24 15:49:05 +00:00
|
|
|
{
|
2020-12-25 14:18:41 +00:00
|
|
|
wordShingleHash(min_heap, max_heap, one_data, data_size, shingle_size, heap_size);
|
2020-12-24 15:49:05 +00:00
|
|
|
}
|
2020-12-25 14:18:41 +00:00
|
|
|
|
|
|
|
if (res1)
|
|
|
|
(*res1)[i] = min_heap.getHash();
|
|
|
|
if (res2)
|
|
|
|
(*res2)[i] = max_heap.getHash();
|
|
|
|
|
|
|
|
if (res1_strings)
|
|
|
|
min_heap.fill(*res1_strings);
|
|
|
|
if (res2_strings)
|
|
|
|
max_heap.fill(*res2_strings);
|
2019-11-06 10:34:13 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
/// Name tags for the SimHash functions. Each type only carries a function-name string literal in its
/// static `name` member and is used as the second template argument of a FunctionsStringHash alias.
struct NameNgramSimHash { static constexpr auto name = "ngramSimHash"; };
struct NameNgramSimHashCaseInsensitive { static constexpr auto name = "ngramSimHashCaseInsensitive"; };
struct NameNgramSimHashUTF8 { static constexpr auto name = "ngramSimHashUTF8"; };
struct NameNgramSimHashCaseInsensitiveUTF8 { static constexpr auto name = "ngramSimHashCaseInsensitiveUTF8"; };

struct NameWordShingleSimHash { static constexpr auto name = "wordShingleSimHash"; };
struct NameWordShingleSimHashCaseInsensitive { static constexpr auto name = "wordShingleSimHashCaseInsensitive"; };
struct NameWordShingleSimHashUTF8 { static constexpr auto name = "wordShingleSimHashUTF8"; };
struct NameWordShingleSimHashCaseInsensitiveUTF8 { static constexpr auto name = "wordShingleSimHashCaseInsensitiveUTF8"; };
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
/// Name tags for the MinHash functions. Each type only carries a function-name string literal in its
/// static `name` member.
struct NameNgramMinHash { static constexpr auto name = "ngramMinHash"; };
struct NameNgramMinHashCaseInsensitive { static constexpr auto name = "ngramMinHashCaseInsensitive"; };
struct NameNgramMinHashUTF8 { static constexpr auto name = "ngramMinHashUTF8"; };
struct NameNgramMinHashCaseInsensitiveUTF8 { static constexpr auto name = "ngramMinHashCaseInsensitiveUTF8"; };

struct NameWordShingleMinHash { static constexpr auto name = "wordShingleMinHash"; };
struct NameWordShingleMinHashCaseInsensitive { static constexpr auto name = "wordShingleMinHashCaseInsensitive"; };
struct NameWordShingleMinHashUTF8 { static constexpr auto name = "wordShingleMinHashUTF8"; };
struct NameWordShingleMinHashCaseInsensitiveUTF8 { static constexpr auto name = "wordShingleMinHashCaseInsensitiveUTF8"; };
|
|
|
|
|
2020-12-25 14:18:41 +00:00
|
|
|
struct NameNgramMinHashArg
|
|
|
|
{
|
|
|
|
static constexpr auto name = "ngramMinHashArg";
|
|
|
|
};
|
|
|
|
|
|
|
|
struct NameNgramMinHashArgCaseInsensitive
|
|
|
|
{
|
|
|
|
static constexpr auto name = "ngramMinHashArgCaseInsensitive";
|
|
|
|
};
|
|
|
|
|
|
|
|
struct NameNgramMinHashArgUTF8
|
|
|
|
{
|
|
|
|
static constexpr auto name = "ngramMinHashArgUTF8";
|
|
|
|
};
|
|
|
|
|
|
|
|
struct NameNgramMinHashArgCaseInsensitiveUTF8
|
|
|
|
{
|
|
|
|
static constexpr auto name = "ngramMinHashArgCaseInsensitiveUTF8";
|
|
|
|
};
|
|
|
|
|
|
|
|
struct NameWordShingleMinHashArg
|
|
|
|
{
|
|
|
|
static constexpr auto name = "wordShingleMinHashArg";
|
|
|
|
};
|
|
|
|
|
|
|
|
struct NameWordShingleMinHashArgCaseInsensitive
|
|
|
|
{
|
|
|
|
static constexpr auto name = "wordShingleMinHashArgCaseInsensitive";
|
|
|
|
};
|
|
|
|
|
|
|
|
// Tag type used as the second template argument of FunctionsStringHash in the
// FunctionWordShingleMinHashArgUTF8 alias; it exposes the string
// "wordShingleMinHashArgUTF8" through its static `name` member.
struct NameWordShingleMinHashArgUTF8
{
    static constexpr auto name = "wordShingleMinHashArgUTF8";
};
|
|
|
|
|
|
|
|
// Tag type used as the second template argument of FunctionsStringHash in the
// FunctionWordShingleMinHashArgCaseInsensitiveUTF8 alias; it exposes the string
// "wordShingleMinHashArgCaseInsensitiveUTF8" through its static `name` member.
struct NameWordShingleMinHashArgCaseInsensitiveUTF8
{
    static constexpr auto name = "wordShingleMinHashArgCaseInsensitiveUTF8";
};
|
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
// SimHash
|
2020-12-25 14:18:41 +00:00
|
|
|
// The eight SimHash function types. The three SimHashImpl flags track the alias
// name suffixes: the first is true for the UTF8 variants, the second is true for
// the Ngram variants (false for WordShingle), and the third is true for the
// CaseInsensitive variants. Every SimHash alias passes `true` as the third
// FunctionsStringHash argument, whereas the MinHash aliases in this file pass
// `false` (NOTE(review): presumably an "is SimHash" switch; confirm against
// the FunctionsStringHash declaration).
using FunctionNgramSimHash
    = FunctionsStringHash<SimHashImpl<false, true, false>, NameNgramSimHash, true>;

using FunctionNgramSimHashCaseInsensitive
    = FunctionsStringHash<SimHashImpl<false, true, true>, NameNgramSimHashCaseInsensitive, true>;

using FunctionNgramSimHashUTF8
    = FunctionsStringHash<SimHashImpl<true, true, false>, NameNgramSimHashUTF8, true>;

using FunctionNgramSimHashCaseInsensitiveUTF8
    = FunctionsStringHash<SimHashImpl<true, true, true>, NameNgramSimHashCaseInsensitiveUTF8, true>;

using FunctionWordShingleSimHash
    = FunctionsStringHash<SimHashImpl<false, false, false>, NameWordShingleSimHash, true>;

using FunctionWordShingleSimHashCaseInsensitive
    = FunctionsStringHash<SimHashImpl<false, false, true>, NameWordShingleSimHashCaseInsensitive, true>;

using FunctionWordShingleSimHashUTF8
    = FunctionsStringHash<SimHashImpl<true, false, false>, NameWordShingleSimHashUTF8, true>;

using FunctionWordShingleSimHashCaseInsensitiveUTF8
    = FunctionsStringHash<SimHashImpl<true, false, true>, NameWordShingleSimHashCaseInsensitiveUTF8, true>;
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2020-12-24 15:49:05 +00:00
|
|
|
// MinHash
|
2020-12-25 14:18:41 +00:00
|
|
|
// The eight MinHash function types. The three MinHashImpl flags track the alias
// name suffixes: the first is true for the UTF8 variants, the second is true for
// the Ngram variants (false for WordShingle), and the third is true for the
// CaseInsensitive variants. Every alias here passes `false` as the third
// FunctionsStringHash argument and leaves the fourth at its default; the
// MinHashArg aliases in this file pass an explicit `true` in that fourth position.
using FunctionNgramMinHash
    = FunctionsStringHash<MinHashImpl<false, true, false>, NameNgramMinHash, false>;

using FunctionNgramMinHashCaseInsensitive
    = FunctionsStringHash<MinHashImpl<false, true, true>, NameNgramMinHashCaseInsensitive, false>;

using FunctionNgramMinHashUTF8
    = FunctionsStringHash<MinHashImpl<true, true, false>, NameNgramMinHashUTF8, false>;

using FunctionNgramMinHashCaseInsensitiveUTF8
    = FunctionsStringHash<MinHashImpl<true, true, true>, NameNgramMinHashCaseInsensitiveUTF8, false>;

using FunctionWordShingleMinHash
    = FunctionsStringHash<MinHashImpl<false, false, false>, NameWordShingleMinHash, false>;

using FunctionWordShingleMinHashCaseInsensitive
    = FunctionsStringHash<MinHashImpl<false, false, true>, NameWordShingleMinHashCaseInsensitive, false>;

using FunctionWordShingleMinHashUTF8
    = FunctionsStringHash<MinHashImpl<true, false, false>, NameWordShingleMinHashUTF8, false>;

using FunctionWordShingleMinHashCaseInsensitiveUTF8
    = FunctionsStringHash<MinHashImpl<true, false, true>, NameWordShingleMinHashCaseInsensitiveUTF8, false>;
|
|
|
|
|
|
|
|
// MinHashArg
|
|
|
|
|
2020-12-25 15:19:30 +00:00
|
|
|
// The eight MinHashArg function types. They reuse the same MinHashImpl
// instantiations as the plain MinHash aliases (first flag true for UTF8, second
// true for Ngram / false for WordShingle, third true for CaseInsensitive) and
// differ only in the name tag and in passing `true` as the fourth
// FunctionsStringHash argument (NOTE(review): presumably selects the "Arg"
// result form; confirm against the FunctionsStringHash declaration).
using FunctionNgramMinHashArg
    = FunctionsStringHash<MinHashImpl<false, true, false>, NameNgramMinHashArg, false, true>;

using FunctionNgramMinHashArgCaseInsensitive
    = FunctionsStringHash<MinHashImpl<false, true, true>, NameNgramMinHashArgCaseInsensitive, false, true>;

using FunctionNgramMinHashArgUTF8
    = FunctionsStringHash<MinHashImpl<true, true, false>, NameNgramMinHashArgUTF8, false, true>;

using FunctionNgramMinHashArgCaseInsensitiveUTF8
    = FunctionsStringHash<MinHashImpl<true, true, true>, NameNgramMinHashArgCaseInsensitiveUTF8, false, true>;

using FunctionWordShingleMinHashArg
    = FunctionsStringHash<MinHashImpl<false, false, false>, NameWordShingleMinHashArg, false, true>;

using FunctionWordShingleMinHashArgCaseInsensitive
    = FunctionsStringHash<MinHashImpl<false, false, true>, NameWordShingleMinHashArgCaseInsensitive, false, true>;

using FunctionWordShingleMinHashArgUTF8
    = FunctionsStringHash<MinHashImpl<true, false, false>, NameWordShingleMinHashArgUTF8, false, true>;

using FunctionWordShingleMinHashArgCaseInsensitiveUTF8
    = FunctionsStringHash<MinHashImpl<true, false, true>, NameWordShingleMinHashArgCaseInsensitiveUTF8, false, true>;
|
2019-11-06 10:34:13 +00:00
|
|
|
|
2022-07-04 07:01:39 +00:00
|
|
|
// Registers all 24 string-hash function types (SimHash, MinHash and MinHashArg
// families) with `factory`. `factory` is not declared in this block; it is
// presumably introduced by the REGISTER_FUNCTION macro (confirm in FunctionFactory.h).
REGISTER_FUNCTION(StringHash)
{
    // SimHash: n-gram variants, then word-shingle variants.
    factory.registerFunction<FunctionNgramSimHash>();
    factory.registerFunction<FunctionNgramSimHashCaseInsensitive>();
    factory.registerFunction<FunctionNgramSimHashUTF8>();
    factory.registerFunction<FunctionNgramSimHashCaseInsensitiveUTF8>();
    factory.registerFunction<FunctionWordShingleSimHash>();
    factory.registerFunction<FunctionWordShingleSimHashCaseInsensitive>();
    factory.registerFunction<FunctionWordShingleSimHashUTF8>();
    factory.registerFunction<FunctionWordShingleSimHashCaseInsensitiveUTF8>();

    // MinHash: n-gram variants, then word-shingle variants.
    factory.registerFunction<FunctionNgramMinHash>();
    factory.registerFunction<FunctionNgramMinHashCaseInsensitive>();
    factory.registerFunction<FunctionNgramMinHashUTF8>();
    factory.registerFunction<FunctionNgramMinHashCaseInsensitiveUTF8>();
    factory.registerFunction<FunctionWordShingleMinHash>();
    factory.registerFunction<FunctionWordShingleMinHashCaseInsensitive>();
    factory.registerFunction<FunctionWordShingleMinHashUTF8>();
    factory.registerFunction<FunctionWordShingleMinHashCaseInsensitiveUTF8>();

    // MinHashArg: n-gram variants, then word-shingle variants.
    factory.registerFunction<FunctionNgramMinHashArg>();
    factory.registerFunction<FunctionNgramMinHashArgCaseInsensitive>();
    factory.registerFunction<FunctionNgramMinHashArgUTF8>();
    factory.registerFunction<FunctionNgramMinHashArgCaseInsensitiveUTF8>();
    factory.registerFunction<FunctionWordShingleMinHashArg>();
    factory.registerFunction<FunctionWordShingleMinHashArgCaseInsensitive>();
    factory.registerFunction<FunctionWordShingleMinHashArgUTF8>();
    factory.registerFunction<FunctionWordShingleMinHashArgCaseInsensitiveUTF8>();
}
|
|
|
|
}
|