Feractoring StringHash

This commit is contained in:
Nikolai Kochetov 2020-12-25 14:22:48 +03:00
parent e8f4a19a10
commit 04d55dc495
2 changed files with 222 additions and 122 deletions

View File

@ -23,112 +23,51 @@ namespace ErrorCodes
// used by FunctionsStringSimilarity and FunctionsStringHash
// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word
template <bool CaseInsensitive>
struct ExtractStringImpl
{
/// Padding form ColumnsString. It is a number of bytes we can always read starting from pos if pos < end.
static constexpr size_t default_padding = 16;
const size_t shingle_size;
const size_t tail_size;
/// Functions are read `default_padding - (N - 1)` bytes into the buffer. Window of size N is used.
/// Read copies `N - 1` last bytes from buffer into beginning, and then reads new bytes.
const size_t buffer_size = default_padding + tail_size;
explicit ExtractStringImpl(size_t shingle_size_)
: shingle_size(shingle_size_)
, tail_size(shingle_size > default_padding ? shingle_size : roundUpToPowerOfTwoOrZero(shingle_size - 1))
{
if (shingle_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Shingle size can't be zero");
}
// read a ASCII word
static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray<UInt8> & word_buf, const char *& pos, const char * end)
static ALWAYS_INLINE inline const UInt8 * readOneASCIIWord(const UInt8 *& pos, const UInt8 * end)
{
// jump separators
while (pos < end && !isAlphaNumericASCII(*pos))
++pos;
// word start from here
const char * word_start = pos;
const UInt8 * word_start = pos;
while (pos < end && isAlphaNumericASCII(*pos))
++pos;
word_buf.assign(word_start, pos);
if (CaseInsensitive)
{
for (auto & symbol : word_buf)
symbol = toLowerIfAlphaASCII(symbol);
}
return word_buf.size();
return word_start;
}
// read one UTF8 word from pos to word
static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray<UInt32> & word_buf, const char *& pos, const char * end)
static ALWAYS_INLINE inline const UInt8 * readOneUTF8Word(const UInt8 *& pos, const UInt8 * end)
{
// jump UTF8 separator
while (pos < end && isUTF8Sep(*pos))
++pos;
word_buf.clear();
// UTF8 word's character number
const UInt8 * word_start = pos;
while (pos < end && !isUTF8Sep(*pos))
{
word_buf.push_back(readOneUTF8Code(pos, end));
}
return word_buf.size();
readOneUTF8Code(pos, end);
return word_start;
}
// we use ASCII non-alphanum character as UTF8 separator
static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); }
// read one UTF8 character and return it
static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end)
// read one UTF8 character
static ALWAYS_INLINE inline void readOneUTF8Code(const UInt8 *& pos, const UInt8 * end)
{
size_t length = UTF8::seqLength(*pos);
if (pos + length > end)
length = end - pos;
UInt32 res;
switch (length)
{
case 1:
res = 0;
memcpy(&res, pos, 1);
break;
case 2:
res = 0;
memcpy(&res, pos, 2);
break;
case 3:
res = 0;
memcpy(&res, pos, 3);
break;
default:
memcpy(&res, pos, 4);
}
if constexpr (CaseInsensitive)
{
switch (length)
{
case 4:
res &= ~(1u << (5 + 3 * CHAR_BIT));
[[fallthrough]];
case 3:
res &= ~(1u << (5 + 2 * CHAR_BIT));
[[fallthrough]];
case 2:
res &= ~(1u);
res &= ~(1u << (5 + CHAR_BIT));
[[fallthrough]];
default:
res &= ~(1u << 5);
}
}
pos += length;
return res;
}
};
}

View File

@ -46,6 +46,17 @@ struct Hash
#endif
}
static UInt64 crc32u16(UInt64 crc [[maybe_unused]], UInt16 val [[maybe_unused]])
{
#ifdef __SSE4_2__
return _mm_crc32_u16(crc, val);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
return __crc32ch(crc, val);
#else
throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED);
#endif
}
static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]])
{
#ifdef __SSE4_2__
@ -57,18 +68,70 @@ struct Hash
#endif
}
static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points)
template <bool CaseInsensitive>
static ALWAYS_INLINE inline UInt64 shingleHash(UInt64 crc, const UInt8 * start, size_t size)
{
return crc32u64(-1ULL, unalignedLoad<UInt32>(code_points));
if (size & 1)
{
UInt8 x = *start;
if constexpr (CaseInsensitive)
x |= 0x20u; /// see toLowerIfAlphaASCII from StringUtils.h
crc = crc32u8(crc, x);
--size;
++start;
}
if (size & 2)
{
UInt16 x = unalignedLoad<UInt16>(start);
if constexpr (CaseInsensitive)
x |= 0x2020u;
crc = crc32u16(crc, x);
size -= 2;
start += 2;
}
if (size & 4)
{
UInt32 x = unalignedLoad<UInt32>(start);
if constexpr (CaseInsensitive)
x |= 0x20202020u;
crc = crc32u32(crc, x);
size -= 4;
start += 4;
}
while (size)
{
UInt64 x = unalignedLoad<UInt64>(start);
if constexpr (CaseInsensitive)
x |= 0x2020202020202020u;
crc = crc32u64(crc, x);
size -= 8;
start += 8;
}
return crc;
}
static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points)
template <bool CaseInsensitive>
static ALWAYS_INLINE inline UInt64 shingleHash(const std::vector<StringRef> & shingle, size_t offset = 0)
{
UInt64 crc = -1ULL;
crc = crc32u64(crc, code_points[0]);
crc = crc32u64(crc, code_points[1]);
crc = crc32u64(crc, code_points[2]);
return crc;
for (size_t i = offset; i < shingle.size(); ++i)
crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
for (size_t i = 0; i < offset; ++i)
crc = shingleHash<CaseInsensitive>(crc, shingle[i].data, shingle[i].size);
}
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
@ -148,54 +211,82 @@ struct Hash
template <size_t N, typename CodePoint, bool UTF8, bool Ngram, bool CaseInsensitive>
struct SimHashImpl
{
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
//using StrOp = ExtractStringImpl<N, CaseInsensitive>;
//static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// SimHash ngram calculate function: String ->UInt64
/// Update fingerprint according to hash_value bits.
static ALWAYS_INLINE inline void updateFingerVector(Int64 * finger_vec, UInt64 hash_value)
{
for (size_t i = 0; i < 64; ++i)
finger_vec[i] += (hash_value & (1ULL << i)) ? 1 : -1;
}
/// Return a 64 bit value according to finger_vec.
static ALWAYS_INLINE inline UInt64 getSimHash(const Int64 * finger_vec)
{
UInt64 res = 0;
for (size_t i = 0; i < 64; ++i)
if (finger_vec[i] > 0)
res |= (1ULL << i);
return res;
}
// SimHash ngram calculate function: String -> UInt64
// this function extracting ngram from input string, and maintain a 64-dimensions vector
// for each ngram, calculate a 64 bit hash value, and update the vector according the hash value
// finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0
static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue(
const char * data,
size_t size,
size_t (*read_code_points)(CodePoint *, const char *&, const char *),
UInt64 (*hash_functor)(const CodePoint *))
static ALWAYS_INLINE inline UInt64 ngramHash(const UInt8 * data, size_t size, size_t shingle_size)
{
const char * start = data;
const char * end = data + size;
// fingerprint vector, all dimensions initialized to zero at the first
if (size < shingle_size)
return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
Int64 finger_vec[64] = {};
CodePoint cp[simultaneously_codepoints_num] = {};
const UInt8 * end = data + size;
size_t found = read_code_points(cp, start, end);
size_t iter = N - 1;
do
for (const UInt8 * pos = data; pos + shingle_size <= end; ++pos)
{
for (; iter + N <= found; ++iter)
{
// for each ngram, we can calculate an 64 bit hash
// then update finger_vec according to this hash value
// if the i'th bit is 1, finger_vec[i] plus 1, otherwise minus 1
UInt64 hash_value = hash_functor(cp + iter);
std::bitset<64> bits(hash_value);
for (size_t i = 0; i < 64; ++i)
{
finger_vec[i] += ((bits.test(i)) ? 1 : -1);
}
}
iter = 0;
} while (start < end && (found = read_code_points(cp, start, end)));
// finally, we return a 64 bit value according to finger_vec
// if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0
std::bitset<64> res_bit(0u);
for (size_t i = 0; i < 64; ++i)
{
if (finger_vec[i] > 0)
res_bit.set(i);
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, pos, shingle_size);
updateFingerVector(finger_vec, hash_value);
}
return res_bit.to_ullong();
return getSimHash(finger_vec);
}
static ALWAYS_INLINE inline UInt64 ngramHashUTF8(
size_t shingle_size,
const UInt8 * data,
size_t size)
{
const UInt8 * start = data;
const UInt8 * end = data + size;
const UInt8 * word_start = start;
const UInt8 * word_end = start;
for (size_t i = 0; i < shingle_size; ++i)
{
if (word_end >= end)
return Hash::shingleHash<CaseInsensitive>(-1ULL, data, size);
ExtractStringImpl::readOneUTF8Code(word_end, end);
}
Int64 finger_vec[64] = {};
while (word_end < end)
{
ExtractStringImpl::readOneUTF8Code(word_start, word_end);
ExtractStringImpl::readOneUTF8Code(word_end, end);
size_t length = word_end - word_start;
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(-1ULL, word_start, length);
updateFingerVector(finger_vec, hash_value);
}
return getSimHash(finger_vec);
}
// SimHash word shingle calculate function: String -> UInt64
@ -208,11 +299,81 @@ struct SimHashImpl
// to calculate the first word shingle hash value
// 2. next, we extract one word each time, and calculate a new hash value of the new word,then use the latest N hash
// values to calculate the next word shingle hash value
static ALWAYS_INLINE inline UInt64 wordShingleHash(
const UInt8 * data,
size_t size,
size_t shingle_size)
{
const UInt8 * start = data;
const UInt8 * end = data + size;
// A 64 bit vector initialized to zero.
Int64 finger_vec[64] = {};
// An array to store N words.
std::vector<StringRef> words;
words.reserve(shingle_size);
// get first word shingle
while (start < end && words.size() < shingle_size)
{
const UInt8 * word_start = nullptr;
if constexpr (UTF8)
word_start = ExtractStringImpl::readOneUTF8Word(start, end);
else
word_start = ExtractStringImpl::readOneASCIIWord(start, end);
size_t length = start - word_start;
if (length)
words.emplace_back(word_start, length);
}
UInt64 hash_value = Hash::shingleHash<CaseInsensitive>(words);
updateFingerVector(finger_vec, hash_value);
size_t offset = 0;
while (start < end)
{
const UInt8 * word_start = nullptr;
if constexpr (UTF8)
word_start = ExtractStringImpl::readOneUTF8Word(start, end);
else
word_start = ExtractStringImpl::readOneASCIIWord(start, end);
size_t length = start - word_start;
if (length == 0)
continue;
// we need to store the new word hash value to the oldest location.
// for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location,
// so we need to store new word hash into location of a0, then ,this array become
// |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new
// word hash value into location of a1, then array become |a5|a6|a2|a3|a4|
words[offset] = StringRef(word_start, length);
++offset;
if (offset >= shingle_size)
offset = 0;
// according to the word hash storation way, in order to not lose the word shingle's
// sequence information, when calculation word shingle hash value, we need provide the offset
// information, which is the offset of the first word's hash value of the word shingle
hash_value = Hash::shingleHash<CaseInsensitive>(words, offset);
updateFingerVector(finger_vec, hash_value);
}
return getSimHash(finger_vec);
}
static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
const char * data,
size_t size,
size_t (*read_one_word)(PaddedPODArray<CodePoint> &, const char *&, const char *),
UInt64 (*hash_functor)(const UInt64 *, size_t, size_t))
size_t shingle_size,
size_t heap_size,
size_t max_word_length)
{
const char * start = data;
const char * end = data + size;
@ -220,7 +381,7 @@ struct SimHashImpl
// Also, a 64 bit vector initialized to zero
Int64 finger_vec[64] = {};
// a array to store N word hash values
UInt64 nword_hashes[N] = {};
std::vector<UInt64> word_hashes(shingle_size, 0);
// word buffer to store one word
PaddedPODArray<CodePoint> word_buf;
// get first word shingle