Refactor code a little bit more.

This commit is contained in:
Nikolai Kochetov 2020-12-21 17:14:05 +03:00
parent c3a99e21bd
commit 29e0b4ec40
2 changed files with 10 additions and 5 deletions

View File

@ -19,11 +19,16 @@ namespace DB
template <size_t N, bool CaseInsensitive>
struct ExtractStringImpl
{
/// Padding from ColumnsString. It is a number of bytes we can always read starting from pos if pos < end.
static constexpr size_t default_padding = 16;
// the length of code_points = default_padding + N -1
/// Functions read `default_padding - (N - 1)` bytes into the buffer. Window of size N is used.
/// Each read copies the last `N - 1` bytes of the buffer to its beginning, and then reads new bytes.
static constexpr size_t buffer_size = default_padding + N - 1;
// the length of code_points = buffer_size
// pos: the current beginning location that we want to copy data
// end: the end loction of the string
// end: the end location of the string
static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
{
/// Offset before which we copy some data.

View File

@ -48,7 +48,7 @@ struct Hash
return crc;
}
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt32 * hashes, size_t size, size_t offset)
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
{
UInt64 crc1 = -1ULL;
UInt64 crc2 = -1ULL;
@ -122,7 +122,7 @@ struct SimhashImpl
// we assume that the size of one word can't exceed 128, which may not be true
// if a word's size exceeds 128, it will be cut up into several words
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// Simhash ngram calculate function: String ->UInt64
// this function extracting ngram from input string, and maintain a 64-dimensions vector
@ -323,7 +323,7 @@ struct MinhashImpl
using MinHeap = FixedHeap<std::greater<size_t>, K, 0>;
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
static constexpr size_t max_string_size = 1u << 15;
static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1;
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
// Minhash ngram calculate function, String -> Tuple(UInt64, UInt64)
// we extract ngram from input string, and calculate a hash value for each ngram