mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 09:32:01 +00:00
Refactor code a little bit more.
This commit is contained in:
parent
c3a99e21bd
commit
29e0b4ec40
@ -19,11 +19,16 @@ namespace DB
|
||||
template <size_t N, bool CaseInsensitive>
|
||||
struct ExtractStringImpl
|
||||
{
|
||||
/// Padding from ColumnsString. It is the number of bytes we can always read starting from pos if pos < end.
|
||||
static constexpr size_t default_padding = 16;
|
||||
|
||||
// the length of code_points = default_padding + N -1
|
||||
/// Functions read `default_padding - (N - 1)` bytes into the buffer. A window of size N is used.
|
||||
/// A read copies the last `N - 1` bytes of the buffer to its beginning, and then reads new bytes.
|
||||
static constexpr size_t buffer_size = default_padding + N - 1;
|
||||
|
||||
// the length of code_points = buffer_size
|
||||
// pos: the current beginning location that we want to copy data
|
||||
// end: the end loction of the string
|
||||
// end: the end location of the string
|
||||
static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end)
|
||||
{
|
||||
/// Offset before which we copy some data.
|
||||
|
@ -48,7 +48,7 @@ struct Hash
|
||||
return crc;
|
||||
}
|
||||
|
||||
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt32 * hashes, size_t size, size_t offset)
|
||||
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset)
|
||||
{
|
||||
UInt64 crc1 = -1ULL;
|
||||
UInt64 crc2 = -1ULL;
|
||||
@ -122,7 +122,7 @@ struct SimhashImpl
|
||||
// we assume that the size of one word can't exceed 128, which may not be true
|
||||
// if a word's size exceeds 128, it is cut up into several words
|
||||
static constexpr size_t max_string_size = 1u << 15;
|
||||
static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1;
|
||||
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
|
||||
|
||||
// Simhash ngram calculate function: String ->UInt64
|
||||
// this function extracting ngram from input string, and maintain a 64-dimensions vector
|
||||
@ -323,7 +323,7 @@ struct MinhashImpl
|
||||
using MinHeap = FixedHeap<std::greater<size_t>, K, 0>;
|
||||
using StrOp = ExtractStringImpl<N, CaseInsensitive>;
|
||||
static constexpr size_t max_string_size = 1u << 15;
|
||||
static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1;
|
||||
static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size;
|
||||
|
||||
// Minhash ngram calculate function, String -> Tuple(UInt64, UInt64)
|
||||
// we extract ngram from input string, and calculate a hash value for each ngram
|
||||
|
Loading…
Reference in New Issue
Block a user