mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-26 19:50:51 +00:00
Feractoring StringHash
This commit is contained in:
parent
e8f4a19a10
commit
04d55dc495
@ -23,112 +23,51 @@ namespace ErrorCodes
|
||||
|
||||
// used by FunctionsStringSimilarity and FunctionsStringHash
|
||||
// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word
|
||||
template <bool CaseInsensitive>
|
||||
struct ExtractStringImpl
|
||||
{
|
||||
/// Padding from ColumnsString. It is the number of bytes we can always read starting from pos if pos < end.
|
||||
static constexpr size_t default_padding = 16;
|
||||
|
||||
const size_t shingle_size;
|
||||
const size_t tail_size;
|
||||
|
||||
/// Functions read `default_padding - (N - 1)` bytes into the buffer. A window of size N is used.
|
||||
/// A read copies the last `N - 1` bytes of the buffer to its beginning, and then reads new bytes.
|
||||
const size_t buffer_size = default_padding + tail_size;
|
||||
|
||||
explicit ExtractStringImpl(size_t shingle_size_)
|
||||
: shingle_size(shingle_size_)
|
||||
, tail_size(shingle_size > default_padding ? shingle_size : roundUpToPowerOfTwoOrZero(shingle_size - 1))
|
||||
{
|
||||
if (shingle_size == 0)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Shingle size can't be zero");
|
||||
}
|
||||
|
||||
// read an ASCII word
|
||||
static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
|
||||
static ALWAYS_INLINE inline const UInt8 * readOneASCIIWord(const UInt8 *& pos, const UInt8 * end)
|
||||
{
|
||||
// skip separators
|
||||
while (pos < end && !isAlphaNumericASCII(*pos))
|
||||
++pos;
|
||||
|
||||
// the word starts here
|
||||
const char * word_start = pos;
|
||||
const UInt8 * word_start = pos;
|
||||
while (pos < end && isAlphaNumericASCII(*pos))
|
||||
++pos;
|
||||
|
||||
word_buf.assign(word_start, pos);
|
||||
if (CaseInsensitive)
|
||||
{
|
||||
for (auto & symbol : word_buf)
|
||||
symbol = toLowerIfAlphaASCII(symbol);
|
||||
}
|
||||
return word_buf.size();
|
||||
return word_start;
|
||||
}
|
||||
|
||||
// read one UTF8 word from pos to word
|
||||
static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
|
||||
static ALWAYS_INLINE inline const UInt8 * readOneUTF8Word(const UInt8 *& pos, const UInt8 * end)
|
||||
{
|
||||
// skip UTF8 separators
|
||||
while (pos < end && isUTF8Sep(*pos))
|
||||
++pos;
|
||||
word_buf.clear();
|
||||
|
||||
// UTF8 word's character number
|
||||
const UInt8 * word_start = pos;
|
||||
|
||||
while (pos < end && !isUTF8Sep(*pos))
|
||||
{
|
||||
word_buf.push_back(readOneUTF8Code(pos, end));
|
||||
}
|
||||
return word_buf.size();
|
||||
readOneUTF8Code(pos, end);
|
||||
|
||||
return word_start;
|
||||
}
|
||||
|
||||
// we use ASCII non-alphanumeric characters as UTF8 separators
|
||||
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }
|
||||
|
||||
// read one UTF8 character and return it
|
||||
static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
|
||||
// read one UTF8 character
|
||||
static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end)
|
||||
{
|
||||
size_t length = UTF8::seqLength(*pos);
|
||||
|
||||
if (pos + length > end)
|
||||
length = end - pos;
|
||||
UInt32 res;
|
||||
switch (length)
|
||||
{
|
||||
case 1:
|
||||
res = 0;
|
||||
memcpy(&res, pos, 1);
|
||||
break;
|
||||
case 2:
|
||||
res = 0;
|
||||
memcpy(&res, pos, 2);
|
||||
break;
|
||||
case 3:
|
||||
res = 0;
|
||||
memcpy(&res, pos, 3);
|
||||
break;
|
||||
default:
|
||||
memcpy(&res, pos, 4);
|
||||
}
|
||||
|
||||
if constexpr (CaseInsensitive)
|
||||
{
|
||||
switch (length)
|
||||
{
|
||||
case 4:
|
||||
res &= ~(1u << (5 + 3 * CHAR_BIT));
|
||||
[[fallthrough]];
|
||||
case 3:
|
||||
res &= ~(1u << (5 + 2 * CHAR_BIT));
|
||||
[[fallthrough]];
|
||||
case 2:
|
||||
res &= ~(1u);
|
||||
res &= ~(1u << (5 + CHAR_BIT));
|
||||
[[fallthrough]];
|
||||
default:
|
||||
res &= ~(1u << 5);
|
||||
}
|
||||
}
|
||||
pos += length;
|
||||
return res;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -46,6 +46,17 @@ struct Hash
|
||||
#endif
|
||||
}
|
||||
|
||||
static UInt64 crc32u16(UInt64 crc [[maybe_unused]], UInt16 val [[maybe_unused]])
|
||||
{
|
||||
#ifdef __SSE4_2__
|
||||
return _mm_crc32_u16(crc, val);
|
||||
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
|
||||
return __crc32ch(crc, val);
|
||||
#else
|
||||
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
|
||||
#endif
|
||||
}
|
||||
|
||||
static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]])
|
||||
{
|
||||
#ifdef __SSE4_2__
|
||||
@ -57,18 +68,70 @@ struct Hash
|
||||
#endif
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points)
|
||||
template <bool CaseInsensitive>
|
||||
static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size)
|
||||
{
|
||||
return crc32u64(-1ULL, unalignedLoad<UInt32>(code_points));
|
||||
if (size & 1)
|
||||
{
|
||||
UInt8 x = *start;
|
||||
|
||||
if constexpr (CaseInsensitive)
|
||||
x |= 0x20u; /// see toLowerIfAlphaASCII from StringUtils.h
|
||||
|
||||
crc = crc32u8(crc, x);
|
||||
--size;
|
||||
++start;
|
||||
}
|
||||
|
||||
if (size & 2)
|
||||
{
|
||||
UInt16 x = unalignedLoad<UInt16>(start);
|
||||
|
||||
if constexpr (CaseInsensitive)
|
||||
x |= 0x2020u;
|
||||
|
||||
crc = crc32u16(crc, x);
|
||||
size -= 2;
|
||||
start += 2;
|
||||
}
|
||||
|
||||
if (size & 4)
|
||||
{
|
||||
UInt32 x = unalignedLoad<UInt32>(start);
|
||||
|
||||
if constexpr (CaseInsensitive)
|
||||
x |= 0x20202020u;
|
||||
|
||||
crc = crc32u32(crc, x);
|
||||
size -= 4;
|
||||
start += 4;
|
||||
}
|
||||
|
||||
while (size)
|
||||
{
|
||||
UInt64 x = unalignedLoad<UInt64>(start);
|
||||
|
||||
if constexpr (CaseInsensitive)
|
||||
x |= 0x2020202020202020u;
|
||||
|
||||
crc = crc32u64(crc, x);
|
||||
size -= 8;
|
||||
start += 8;
|
||||
}
|
||||
|
||||
return crc;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points)
|
||||
template <bool CaseInsensitive>
|
||||
static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector<StringRef> & shingle, size_t offset = 0)
|
||||
{
|
||||
UInt64 crc = -1ULL;
|
||||
crc = crc32u64(crc, code_points[0]);
|
||||
crc = crc32u64(crc, code_points[1]);
|
||||
crc = crc32u64(crc, code_points[2]);
|
||||
return crc;
|
||||
|
||||
for (size_t i = offset; i < shingle.size(); ++i)
|
||||
crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
|
||||
|
||||
for (size_t i = 0; i < offset; ++i)
|
||||
crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
|
||||
@ -148,54 +211,82 @@ struct Hash
|
||||
template <size_t N, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
|
||||
struct SimHashImpl
|
||||
{
|
||||
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
|
||||
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
|
||||
//using StrOp = ExtractStringImpl<N, CaseInsensitive>;
|
||||
//static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
|
||||
|
||||
// SimHash ngram calculate function: String ->UInt64
|
||||
/// Update fingerprint according to hash_value bits.
|
||||
static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value)
|
||||
{
|
||||
for (size_t i = 0; i < 64; ++i)
|
||||
finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1;
|
||||
}
|
||||
|
||||
/// Return a 64 bit value according to finger_vec.
|
||||
static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec)
|
||||
{
|
||||
UInt64 res = 0;
|
||||
|
||||
for (size_t i = 0; i < 64; ++i)
|
||||
if (finger_vec[i] > 0)
|
||||
res |= (1ULL << i);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// SimHash ngram calculate function: String -> UInt64
|
||||
// this function extracts ngrams from the input string and maintains a 64-dimensional vector
|
||||
// for each ngram, calculate a 64 bit hash value and update the vector according to the hash value
|
||||
// finally, return a 64 bit value (UInt64) whose i'th bit is 1 if vector[i] > 0 and 0 otherwise
|
||||
static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue(
|
||||
const char * data,
|
||||
size_t size,
|
||||
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
|
||||
UInt64 (*hash_functor)(const CodePoint *))
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 ngramHash(const UInt8 * data, size_t size, size_t shingle_size)
|
||||
{
|
||||
const char * start = data;
|
||||
const char * end = data + size;
|
||||
// fingerprint vector, all dimensions initialized to zero at the first
|
||||
if (size < shingle_size)
|
||||
return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
|
||||
|
||||
Int64 finger_vec[64] = {};
|
||||
CodePoint cp[simultaneously_codepoints_num] = {};
|
||||
const UInt8 * end = data + size;
|
||||
|
||||
size_t found = read_code_points(cp, start, end);
|
||||
size_t iter = N - 1;
|
||||
|
||||
do
|
||||
for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos)
|
||||
{
|
||||
for (; iter + N <= found; ++iter)
|
||||
{
|
||||
// for each ngram, we can calculate an 64 bit hash
|
||||
// then update finger_vec according to this hash value
|
||||
// if the i'th bit is 1, finger_vec[i] plus 1, otherwise minus 1
|
||||
UInt64 hash_value = hash_functor(cp + iter);
|
||||
std::bitset<64> bits(hash_value);
|
||||
for (size_t i = 0; i < 64; ++i)
|
||||
{
|
||||
finger_vec[i] += ((bits.test(i)) ? 1 : -1);
|
||||
}
|
||||
}
|
||||
iter = 0;
|
||||
} while (start < end && (found = read_code_points(cp, start, end)));
|
||||
|
||||
// finally, we return a 64 bit value according to finger_vec
|
||||
// if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0
|
||||
std::bitset<64> res_bit(0u);
|
||||
for (size_t i = 0; i < 64; ++i)
|
||||
{
|
||||
if (finger_vec[i] > 0)
|
||||
res_bit.set(i);
|
||||
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, pos, shingle_size);
|
||||
updateFingerVector(finger_vec, hash_value);
|
||||
}
|
||||
return res_bit.to_ullong();
|
||||
|
||||
return getSimHash(finger_vec);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 ngramHashUTF8(
|
||||
size_t shingle_size,
|
||||
const UInt8 * data,
|
||||
size_t size)
|
||||
{
|
||||
const UInt8 * start = data;
|
||||
const UInt8 * end = data + size;
|
||||
|
||||
const UInt8 * word_start = start;
|
||||
const UInt8 * word_end = start;
|
||||
|
||||
for (size_t i = 0; i < shingle_size; ++i)
|
||||
{
|
||||
if (word_end >= end)
|
||||
return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
|
||||
|
||||
ExtractStringImpl::readOneUTF8Code(word_end, end);
|
||||
}
|
||||
|
||||
Int64 finger_vec[64] = {};
|
||||
|
||||
while (word_end < end)
|
||||
{
|
||||
ExtractStringImpl::readOneUTF8Code(word_start, word_end);
|
||||
ExtractStringImpl::readOneUTF8Code(word_end, end);
|
||||
|
||||
size_t length = word_end - word_start;
|
||||
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, word_start, length);
|
||||
updateFingerVector(finger_vec, hash_value);
|
||||
}
|
||||
|
||||
return getSimHash(finger_vec);
|
||||
}
|
||||
|
||||
// SimHash word shingle calculate function: String -> UInt64
|
||||
@ -208,11 +299,81 @@ struct SimHashImpl
|
||||
// to calculate the first word shingle hash value
|
||||
// 2. next, we extract one word each time, and calculate a new hash value of the new word, then use the latest N hash
|
||||
// values to calculate the next word shingle hash value
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 wordShingleHash(
|
||||
const UInt8 * data,
|
||||
size_t size,
|
||||
size_t shingle_size)
|
||||
{
|
||||
const UInt8 * start = data;
|
||||
const UInt8 * end = data + size;
|
||||
|
||||
// A 64 bit vector initialized to zero.
|
||||
Int64 finger_vec[64] = {};
|
||||
// An array to store N words.
|
||||
std::vector<StringRef> words;
|
||||
words.reserve(shingle_size);
|
||||
|
||||
// get first word shingle
|
||||
while (start < end && words.size() < shingle_size)
|
||||
{
|
||||
const UInt8 * word_start = nullptr;
|
||||
|
||||
if constexpr (UTF8)
|
||||
word_start = ExtractStringImpl::readOneUTF8Word(start, end);
|
||||
else
|
||||
word_start = ExtractStringImpl::readOneASCIIWord(start, end);
|
||||
|
||||
size_t length = start - word_start;
|
||||
|
||||
if (length)
|
||||
words.emplace_back(word_start, length);
|
||||
}
|
||||
|
||||
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(words);
|
||||
updateFingerVector(finger_vec, hash_value);
|
||||
|
||||
size_t offset = 0;
|
||||
while (start < end)
|
||||
{
|
||||
const UInt8 * word_start = nullptr;
|
||||
|
||||
if constexpr (UTF8)
|
||||
word_start = ExtractStringImpl::readOneUTF8Word(start, end);
|
||||
else
|
||||
word_start = ExtractStringImpl::readOneASCIIWord(start, end);
|
||||
|
||||
size_t length = start - word_start;
|
||||
|
||||
if (length == 0)
|
||||
continue;
|
||||
|
||||
// we need to store the new word hash value to the oldest location.
|
||||
// for example, with N = 5 and array |a0|a1|a2|a3|a4|, a0 is now the oldest location,
|
||||
// so we need to store the new word hash into the location of a0; the array then becomes
|
||||
// |a5|a1|a2|a3|a4|. Next time, a1 becomes the oldest location, so we need to store the new
|
||||
// word hash value into location of a1, then array become |a5|a6|a2|a3|a4|
|
||||
words[offset] = StringRef(word_start, length);
|
||||
++offset;
|
||||
if (offset >= shingle_size)
|
||||
offset = 0;
|
||||
|
||||
// because of the way word hashes are stored, in order not to lose the word shingle's
|
||||
// sequence information, when calculating the word shingle hash value we need to provide the offset
|
||||
// information, which is the offset of the first word's hash value of the word shingle
|
||||
hash_value = Hash::shingleHash<CaseInsensitive>(words, offset);
|
||||
updateFingerVector(finger_vec, hash_value);
|
||||
}
|
||||
|
||||
return getSimHash(finger_vec);
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
|
||||
const char * data,
|
||||
size_t size,
|
||||
size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
|
||||
UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
|
||||
size_t shingle_size,
|
||||
size_t heap_size,
|
||||
size_t max_word_length)
|
||||
{
|
||||
const char * start = data;
|
||||
const char * end = data + size;
|
||||
@ -220,7 +381,7 @@ struct SimHashImpl
|
||||
// Also, a 64 bit vector initialized to zero
|
||||
Int64 finger_vec[64] = {};
|
||||
// an array to store N word hash values
|
||||
UInt64 nword_hashes[N] = {};
|
||||
std::vector<UInt64> word_hashes(shingle_size, 0);
|
||||
// word buffer to store one word
|
||||
PaddedPODArray<CodePoint> word_buf;
|
||||
// get first word shingle
|
||||
|
Loading…
Reference in New Issue
Block a user