From 29e0b4ec40e2a09c0c7f30f6918ddb53229067c3 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 21 Dec 2020 17:14:05 +0300 Subject: [PATCH] Refactor code a little bit more. --- src/Functions/ExtractString.h | 9 +++++++-- src/Functions/FunctionsStringHash.cpp | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index 51d6f17380c..b659d072887 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -19,11 +19,16 @@ namespace DB template struct ExtractStringImpl { + /// Padding from ColumnsString. It is the number of bytes we can always read starting from pos if pos < end. static constexpr size_t default_padding = 16; - // the length of code_points = default_padding + N -1 + /// Functions read `default_padding - (N - 1)` bytes into the buffer. A window of size N is used. + /// Each read copies the last `N - 1` bytes of the buffer to its beginning, and then reads new bytes. + static constexpr size_t buffer_size = default_padding + N - 1; + + // the length of code_points = buffer_size // pos: the current beginning location that we want to copy data - // end: the end loction of the string + // end: the end location of the string static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) { /// Offset before which we copy some data. 
diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index b27d8601f3a..70e524c5df8 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -48,7 +48,7 @@ struct Hash return crc; } - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt32 * hashes, size_t size, size_t offset) + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; @@ -122,7 +122,7 @@ struct SimhashImpl // we made an assumption that the size of one word cann't exceed 128, which may not true // if some word's size exceed 128, it would be cut up to several word static constexpr size_t max_string_size = 1u << 15; - static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size; // Simhash ngram calculate function: String ->UInt64 // this function extracting ngram from input string, and maintain a 64-dimensions vector @@ -323,7 +323,7 @@ struct MinhashImpl using MinHeap = FixedHeap, K, 0>; using StrOp = ExtractStringImpl; static constexpr size_t max_string_size = 1u << 15; - static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size; // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) // we extract ngram from input string, and calculate a hash value for each ngram