From 2d198f640ed4627cf6f12d03589ccc8f16b2722e Mon Sep 17 00:00:00 2001 From: fenglv Date: Wed, 6 Nov 2019 18:34:13 +0800 Subject: [PATCH 01/23] add simhash and minhash --- dbms/src/Functions/ExtractString.h | 187 ++++++ dbms/src/Functions/FunctionsStringHash.cpp | 585 ++++++++++++++++++ dbms/src/Functions/FunctionsStringHash.h | 124 ++++ .../Functions/FunctionsStringSimilarity.cpp | 158 +---- 4 files changed, 919 insertions(+), 135 deletions(-) create mode 100644 dbms/src/Functions/ExtractString.h create mode 100644 dbms/src/Functions/FunctionsStringHash.cpp create mode 100644 dbms/src/Functions/FunctionsStringHash.h diff --git a/dbms/src/Functions/ExtractString.h b/dbms/src/Functions/ExtractString.h new file mode 100644 index 00000000000..05566496cba --- /dev/null +++ b/dbms/src/Functions/ExtractString.h @@ -0,0 +1,187 @@ +#include + +#include +#include +#include +#include +#include + +#ifdef __SSE4_2__ +# include +#endif + +namespace DB +{ +//used by FunctionsStringSimilarity and FunctionsStringHash +//includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +template +struct ExtractStringImpl +{ + static constexpr size_t default_padding = 16; + + static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) + { + /// Offset before which we copy some data. 
+ constexpr size_t padding_offset = default_padding - N + 1; + /// We have an array like this for ASCII (N == 4, other cases are similar) + /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start + /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); + /// Now we have an array + /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// Doing unaligned read of 16 bytes and copy them like above + /// 16 is also chosen to do two `movups`. + /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. + memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8)); + + if constexpr (CaseInsensitive) + { + /// We really need template lambdas with C++20 to do it inline + unrollLowering(code_points, std::make_index_sequence()); + } + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; + } + + //used by FunctionsStringHash + //it's not easy to add padding for ColumnString, so we need safety check each memcpy + static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end) + { + constexpr size_t padding_offset = default_padding - N + 1; + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); + + //safety check + size_t cpy_size = (pos + padding_offset > end) ? 
end - pos : padding_offset; + + memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8)); + + if constexpr (CaseInsensitive) + { + unrollLowering(code_points, std::make_index_sequence()); + } + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; + } + + //read a ASCII word from pos to word + //if the word size exceeds max_word_size, only read max_word_size byte + //in FuntionsStringHash, the default value of max_word_size is 128 + static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size) + { + //jump seperators + while (pos < end && !isAlphaNum(*pos)) + ++pos; + + // word start from here + const char * word_start = pos; + while (pos < end && isAlphaNum(*pos)) + ++pos; + + size_t word_size = (static_cast(pos - word_start) <= max_word_size) ? pos - word_start : max_word_size; + + memcpy(word, word_start, word_size); + if (CaseInsensitive) + { + std::transform(word, word + word_size, word, [](UInt8 c) { return std::tolower(c); }); + } + return word_size; + } + + static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end) + { + memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32)); + + size_t num = N - 1; + while (num < default_padding && pos < end) + { + code_points[num++] = readOneUTF8Code(pos, end); + } + return num; + } + + //read one UTF8 word from pos to word + //also, we assume that one word size cann't exceed max_word_size with default value 128 + static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size) + { + // jump UTF8 seperator + while (pos < end && isUTF8Sep(*pos)) + ++pos; + //UTF8 word's character number + size_t num = 0; + while (pos < end && num < max_word_size && !isUTF8Sep(*pos)) + { + word[num++] = readOneUTF8Code(pos, end); + } + return 
num; + } + +private: + static ALWAYS_INLINE inline bool isAlphaNum(const UInt8 c) + { + return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122); + } + + template + static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) + { + ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); + } + + //we use ASCII non-alphanum character as UTF8 seperator + static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); } + + // read one UTF8 character and return it + static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end) + { + size_t length = UTF8::seqLength(*pos); + + if (pos + length > end) + length = end - pos; + UInt32 res; + switch (length) + { + case 1: + res = 0; + memcpy(&res, pos, 1); + break; + case 2: + res = 0; + memcpy(&res, pos, 2); + break; + case 3: + res = 0; + memcpy(&res, pos, 3); + break; + default: + memcpy(&res, pos, 4); + } + + if constexpr (CaseInsensitive) + { + switch (length) + { + case 4: + res &= ~(1u << (5 + 3 * CHAR_BIT)); + [[fallthrough]]; + case 3: + res &= ~(1u << (5 + 2 * CHAR_BIT)); + [[fallthrough]]; + case 2: + res &= ~(1u); + res &= ~(1u << (5 + CHAR_BIT)); + [[fallthrough]]; + default: + res &= ~(1u << 5); + } + } + pos += length; + return res; + } +}; +} diff --git a/dbms/src/Functions/FunctionsStringHash.cpp b/dbms/src/Functions/FunctionsStringHash.cpp new file mode 100644 index 00000000000..797d7d30078 --- /dev/null +++ b/dbms/src/Functions/FunctionsStringHash.cpp @@ -0,0 +1,585 @@ +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace DB +{ +struct Hash +{ + static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points) + { + return intHashCRC32(unalignedLoad(code_points)); + } + + static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points) + { + UInt64 combined = (static_cast(code_points[0]) << 32) | 
code_points[1]; +#ifdef __SSE4_2__ + return _mm_crc32_u64(code_points[2], combined); +#else + return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])); +#endif + } + + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, const size_t & size, const size_t & offset) + { + UInt64 res = 0; + UInt8 flag = 0; + for (size_t i = offset; i < size; ++i) + { + if (flag) + res &= intHashCRC32(hashes[i]); + else + res |= intHashCRC32(hashes[i]); + flag = (flag + 1) % 2; + } + for (size_t i = 0; i < offset; ++i) + { + if (flag) + res &= intHashCRC32(hashes[i]); + else + res |= intHashCRC32(hashes[i]); + flag = (flag + 1) % 2; + } + return res; + } + + template + static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, const size_t & K) + { + UInt64 even = 0; + UInt64 odd = 0; + size_t i = 0; + for (; i + 1 < K; i += 2) + { + even |= intHashCRC32(hashes[i]); + odd |= intHashCRC32(hashes[i + 1]); + } + if (i < K) + even |= intHashCRC32(hashes[K - 1]); +#ifdef __SSE4_2__ + return _mm_crc32_u64(even, odd); +#else + return (intHashCRC32(even) ^ intHashCRC32(odd)); +#endif + } +}; + +//Sinhash String -> UInt64 +template +struct SimhashImpl +{ + using ResultType = UInt64; + using StrOp = ExtractStringImpl; + // we made an assumption that the size of one word cann't exceed 128, which may not true + // if some word's size exceed 128, it would be cut up to several word + static constexpr size_t max_word_size = 1u << 7; + static constexpr size_t max_string_size = 1u << 15; + static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + + // Simhash ngram calculate function: String ->UInt64 + // this function extracting ngram from input string, and maintain a 64-dimensions vector + // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value + // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 + static ALWAYS_INLINE inline UInt64 
ngramCalculateHashValue( + const char * data, + const size_t size, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt64 (*hash_functor)(const CodePoint *)) + { + const char * start = data; + const char * end = data + size; + // fingerprint vector, all dimensions initialized to zero at the first + Int64 finger_vec[64] = {}; + CodePoint cp[simultaneously_codepoints_num] = {}; + + size_t found = read_code_points(cp, start, end); + size_t iter = N - 1; + + do + { + for (; iter + N <= found; ++iter) + { + // for each ngram, we can calculate an 64 bit hash + // then update finger_vec according to this hash value + // if the i'th bit is 1, finger_vec[i] plus 1, otherwise minus 1 + UInt64 hash_value = hash_functor(cp + iter); + std::bitset<64> bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits.test(i)) ? 1 : -1); + } + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + //finally, we return a 64 bit value according to finger_vec + //if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 + std::bitset<64> res_bit(0u); + for (size_t i = 0; i < 64; ++i) + { + if (finger_vec[i] > 0) + res_bit.set(i); + } + return res_bit.to_ullong(); + } + + // Simhash word shingle calculate funtion: String -> UInt64 + // this function extracting n word shingle from input string, and maintain a 64-dimensions vector as well + // for each word shingle, calculate a 64 bit hash value, and update the vector according the hash value + // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 + // + // word shingle hash value calculate: + // 1. at the first, extracts N word shingles and calculate N hash values, store into an array, use this N hash values + // to calculate the first word shingle hash value + // 2. 
next, we extrac one word each time, and calculate a new hash value of the new word,then use the latest N hash + // values to caculate the next word shingle hash value + static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue( + const char * data, + const size_t size, + size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &), + UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &)) + { + const char * start = data; + const char * end = data + size; + + // Also, a 64 bit vector initialized to zero + Int64 finger_vec[64] = {}; + // a array to store N word hash values + UInt64 nwordHashes[N] = {}; + // word buffer to store one word + CodePoint word_buf[max_word_size] = {}; + size_t word_size; + //get first word shingle + for (size_t i = 0; i < N && start < end; ++i) + { + word_size = read_one_word(word_buf, start, end, max_word_size); + if (word_size) + { + // for each word, calculate a hash value and stored into the array + nwordHashes[i++] = Hash::hashSum(word_buf, word_size); + } + } + + // calculate the first word shingle hash value + UInt64 hash_value = hash_functor(nwordHashes, N, 0); + std::bitset<64> bits_(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits_.test(i)) ? 1 : -1); + } + + size_t offset = 0; + while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) + { + // we need to store the new word hash value to the oldest location. 
+ // for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location, + // so we need to store new word hash into location of a0, then ,this array become + // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new + // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| + nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + offset = (offset + 1) % N; + //according to the word hash storation way, in order to not lose the word shingle's + //sequence information, when calculation word shingle hash value, we need provide the offset + //inforation, which is the offset of the first word's hash value of the word shingle + hash_value = hash_functor(nwordHashes, N, offset); + std::bitset<64> bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits.test(i)) ? 1 : -1); + } + } + + std::bitset<64> res_bit(0u); + for (size_t i = 0; i < 64; ++i) + { + if (finger_vec[i] > 0) + res_bit.set(i); + } + return res_bit.to_ullong(); + } + + template + static ALWAYS_INLINE inline auto dispatch(CalcFunc calc_func, Args &&... 
args) + { + if constexpr (Ngram) + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + else + return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } + else + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + return calc_func(std::forward(args)..., StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } + } + + // constant string + static inline void constant(const String data, UInt64 & res) + { + if constexpr (Ngram) + res = dispatch(ngramCalculateHashValue, data.data(), data.size()); + else + res = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); + } + + //non-constant string + static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) + { + for (size_t i = 0; i < offsets.size(); ++i) + { + const char * one_data = reinterpret_cast(&data[offsets[i - 1]]); + const size_t data_size = offsets[i] - offsets[i - 1] - 1; + if (data_size <= max_string_size) + { + if constexpr (Ngram) + res[i] = dispatch(ngramCalculateHashValue, one_data, data_size); + else + res[i] = dispatch(wordShinglesCalculateHashValue, one_data, data_size); + } + } + } +}; + +//Minhash: String -> Tuple(UInt64, UInt64) +//for each string, we extract ngram or word shingle, +//for each ngram or word shingle, calculate a hash value, +//then we take the K minimum hash values to calculate a hashsum, +//and take the K maximum hash values to calculate another hashsum, +//return this two hashsum: Tuple(hashsum1, hashsum2) +template +struct MinhashImpl +{ + using ResultType = UInt64; + using StrOp = ExtractStringImpl; + static constexpr size_t max_word_size = 1u << 7; + static constexpr size_t max_string_size = 1u << 15; + static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + + // insert a new value into K minimum hash 
array if this value + // is smaller than the greatest value in the array + static ALWAYS_INLINE inline void insert_minValue(UInt64 * hashes, UInt64 v) + { + size_t i = 0; + for (; i < K && hashes[i] <= v; ++i) + ; + if (i == K) + return; + for (size_t j = K - 2; j >= i; --j) + hashes[j + 1] = hashes[j]; + hashes[i] = v; + } + + // insert a new value into K maximum hash array if this value + // is greater than the smallest value in the array + static ALWAYS_INLINE inline void insert_maxValue(UInt64 * hashes, UInt64 v) + { + int i = K - 1; + for (; i >= 0 && hashes[i] >= v; --i) + ; + if (i < 0) + return; + for (int j = 1; j <= i; ++j) + hashes[j - 1] = hashes[j]; + hashes[i] = v; + } + + //Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) + //we extract ngram from input string, and calculate a hash value for each ngram + //then we take the K minimum hash values to calculate a hashsum, + //and take the K maximum hash values to calculate another hashsum, + //return this two hashsum: Tuple(hashsum1, hashsum2) + static ALWAYS_INLINE inline std::tuple ngramCalculateHashValue( + const char * data, + const size_t size, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt64 (*hash_functor)(const CodePoint *)) + { + const char * start = data; + const char * end = data + size; + // we just maintain the K minimu and K maximum hash values + UInt64 k_minimum[K] = {}; + UInt64 k_maxinum[K] = {}; + CodePoint cp[simultaneously_codepoints_num] = {}; + + size_t found = read_code_points(cp, start, end); + size_t iter = N - 1; + + do + { + for (; iter + N <= found; ++iter) + { + auto new_hash = hash_functor(cp + iter); + // insert the new hash value into array used to store K minimum value + // and K maximum value + insert_minValue(k_minimum, new_hash); + insert_maxValue(k_maxinum, new_hash); + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + // calculate hashsum of the K minimum hash values and K maximum 
hash values + UInt64 res1 = Hash::hashSum(k_maxinum, K); + UInt64 res2 = Hash::hashSum(k_maxinum, K); + return std::make_tuple(res1, res2); + } + + // Minhash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) + //for each word shingle, we calculate a hash value, but in fact, we just maintain the + //K minimum and K maximum hash value + static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( + const char * data, + const size_t size, + size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &), + UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &)) + { + const char * start = data; + const char * end = start + size; + //also we just store the K minimu and K maximum hash values + UInt64 k_minimum[K] = {}; + UInt64 k_maxinum[K] = {}; + // array to store n word hashes + UInt64 nwordHashes[N] = {}; + // word buffer to store one word + CodePoint word_buf[max_word_size] = {}; + size_t word_size; + //how word shingle hash value calculation and word hash storation is same as we + //have descripted in Simhash wordShinglesCalculateHashValue function + for (size_t i = 0; i < N && start < end; ++i) + { + word_size = read_one_word(word_buf, start, end, max_word_size); + if (word_size) + { + nwordHashes[i++] = Hash::hashSum(word_buf, word_size); + } + } + + auto new_hash = hash_functor(nwordHashes, N, 0); + insert_minValue(k_minimum, new_hash); + insert_maxValue(k_maxinum, new_hash); + + size_t offset = 0; + while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) + { + nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + offset = (offset + 1) % N; + new_hash = hash_functor(nwordHashes, N, offset); + insert_minValue(k_minimum, new_hash); + insert_maxValue(k_maxinum, new_hash); + } + + // calculate hashsum + UInt64 res1 = Hash::hashSum(k_minimum, K); + UInt64 res2 = Hash::hashSum(k_maxinum, K); + return std::make_tuple(res1, res2); + } + + template + static 
ALWAYS_INLINE inline auto dispatch(CalcFunc calc_func, Args &&... args) + { + if constexpr (Ngram) + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + else + return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } + else + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + return calc_func(std::forward(args)..., StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } + } + + // constant string + static void constant(const String data, UInt64 & res1, UInt64 & res2) + { + if constexpr (Ngram) + std::tie(res1, res2) = dispatch(ngramCalculateHashValue, data.data(), data.size()); + else + std::tie(res1, res2) = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); + } + + //non-constant string + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + PaddedPODArray & res1, + PaddedPODArray & res2) + { + for (size_t i = 0; i < offsets.size(); ++i) + { + const char * one_data = reinterpret_cast(&data[offsets[i - 1]]); + const size_t data_size = offsets[i] - offsets[i - 1] - 1; + if (data_size <= max_string_size) + { + if constexpr (Ngram) + std::tie(res1[i], res2[i]) = dispatch(ngramCalculateHashValue, one_data, data_size); + else + std::tie(res1[i], res2[i]) = dispatch(wordShinglesCalculateHashValue, one_data, data_size); + } + } + } +}; + +struct NameNgramSimhash +{ + static constexpr auto name = "ngramSimhash"; +}; + +struct NameNgramSimhashCaseInsensitive +{ + static constexpr auto name = "ngramSimhashCaseInsensitive"; +}; + +struct NameNgramSimhashUTF8 +{ + static constexpr auto name = "ngramSimhashUTF8"; +}; + +struct NameNgramSimhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "ngramSimhashCaseInsensitiveUTF8"; +}; + +struct NameWordShingleSimhash +{ + static constexpr auto name = "wordShingleSimhash"; +}; + +struct 
NameWordShingleSimhashCaseInsensitive +{ + static constexpr auto name = "wordShingleSimhashCaseInsensitive"; +}; + +struct NameWordShingleSimhashUTF8 +{ + static constexpr auto name = "wordShingleSimhashUTF8"; +}; + +struct NameWordShingleSimhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "wordShingleSimhashCaseInsensitiveUTF8"; +}; + +struct NameNgramMinhash +{ + static constexpr auto name = "ngramMinhash"; +}; + +struct NameNgramMinhashCaseInsensitive +{ + static constexpr auto name = "ngramMinhashCaseInsensitive"; +}; + +struct NameNgramMinhashUTF8 +{ + static constexpr auto name = "ngramMinhashUTF8"; +}; + +struct NameNgramMinhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "ngramMinhashCaseInsensitiveUTF8"; +}; + +struct NameWordShingleMinhash +{ + static constexpr auto name = "wordShingleMinhash"; +}; + +struct NameWordShingleMinhashCaseInsensitive +{ + static constexpr auto name = "wordShingleMinhashCaseInsensitive"; +}; + +struct NameWordShingleMinhashUTF8 +{ + static constexpr auto name = "wordShingleMinhashUTF8"; +}; + +struct NameWordShingleMinhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8"; +}; + +//Simhash +using FunctionNgramSimhash = FunctionsStringHash, NameNgramSimhash, true>; + +using FunctionNgramSimhashCaseInsensitive + = FunctionsStringHash, NameNgramSimhashCaseInsensitive, true>; + +using FunctionNgramSimhashUTF8 = FunctionsStringHash, NameNgramSimhashUTF8, true>; + +using FunctionNgramSimhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameNgramSimhashCaseInsensitiveUTF8, true>; + +using FunctionWordShingleSimhash = FunctionsStringHash, NameWordShingleSimhash, true>; + +using FunctionWordShingleSimhashCaseInsensitive + = FunctionsStringHash, NameWordShingleSimhashCaseInsensitive, true>; + +using FunctionWordShingleSimhashUTF8 = FunctionsStringHash, NameWordShingleSimhashUTF8, true>; + +using FunctionWordShingleSimhashCaseInsensitiveUTF8 + = FunctionsStringHash, 
NameWordShingleSimhashCaseInsensitiveUTF8, true>; + +//Minhash +using FunctionNgramMinhash = FunctionsStringHash, NameNgramMinhash, false>; + +using FunctionNgramMinhashCaseInsensitive + = FunctionsStringHash, NameNgramMinhashCaseInsensitive, false>; + +using FunctionNgramMinhashUTF8 = FunctionsStringHash, NameNgramMinhashUTF8, false>; + +using FunctionNgramMinhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameNgramMinhashCaseInsensitiveUTF8, false>; + +using FunctionWordShingleMinhash = FunctionsStringHash, NameWordShingleMinhash, false>; + +using FunctionWordShingleMinhashCaseInsensitive + = FunctionsStringHash, NameWordShingleMinhashCaseInsensitive, false>; + +using FunctionWordShingleMinhashUTF8 + = FunctionsStringHash, NameWordShingleMinhashUTF8, false>; + +using FunctionWordShingleMinhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameWordShingleMinhashCaseInsensitiveUTF8, false>; + +void registerFunctionsStringHash(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); +} +} + diff --git a/dbms/src/Functions/FunctionsStringHash.h b/dbms/src/Functions/FunctionsStringHash.h new file mode 100644 index 00000000000..185097ade99 --- /dev/null +++ b/dbms/src/Functions/FunctionsStringHash.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int 
TOO_LARGE_STRING_SIZE; +} + +//FunctionStringHash +//Simhash: String -> UInt64 +//Minhash: String -> (UInt64, UInt64) +template +class FunctionsStringHash : public IFunction +{ +public: + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (IsSimhash) + return std::make_shared>(); + auto element = DataTypeFactory::instance().get("UInt64"); + return std::make_shared(DataTypes{element, element}); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + const ColumnPtr & column = block.getByPosition(arguments[0]).column; + const ColumnConst * col_const = typeid_cast(&*column); + using ResultType = typename Impl::ResultType; + if constexpr (IsSimhash) + { + if (col_const) + { + ResultType res{}; + const String & str_data = col_const->getValue(); + if (str_data.size() > Impl::max_string_size) + { + throw Exception( + "String size is too big for function " + getName() + ". 
Should be at most " + std::to_string(Impl::max_string_size), + ErrorCodes::TOO_LARGE_STRING_SIZE); + } + Impl::constant(str_data, res); + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(1, toField(res)); + } + else + { + // non const string + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); + block.getByPosition(result).column = std::move(col_res); + } + } + else // Min hash + { + if (col_const) + { + ResultType h1, h2; + const String & str_data = col_const->getValue(); + if (str_data.size() > Impl::max_string_size) + { + throw Exception( + "String size is too big for function " + getName() + ". Should be at most " + std::to_string(Impl::max_string_size), + ErrorCodes::TOO_LARGE_STRING_SIZE); + } + Impl::constant(str_data, h1, h2); + auto h1_col = ColumnVector::create(1); + auto h2_col = ColumnVector::create(1); + typename ColumnVector::Container & h1_data = h1_col->getData(); + typename ColumnVector::Container & h2_data = h2_col->getData(); + h1_data[0] = h1; + h2_data[0] = h2; + MutableColumns tuple_columns; + tuple_columns.emplace_back(std::move(h1_col)); + tuple_columns.emplace_back(std::move(h2_col)); + block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); + } + else + { + //non const string + auto col_h1 = ColumnVector::create(); + auto col_h2 = ColumnVector::create(); + typename ColumnVector::Container & vec_h1 = col_h1->getData(); + typename ColumnVector::Container & vec_h2 = col_h2->getData(); + vec_h1.resize(column->size()); + vec_h2.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); + MutableColumns tuple_columns; + 
tuple_columns.emplace_back(std::move(col_h1)); + tuple_columns.emplace_back(std::move(col_h2)); + block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); + } + } + } +}; +} + diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index 9dda521cd29..c6327ad59b4 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -10,13 +11,6 @@ #include -#include -#include -#include -#include -#include -#include - #ifdef __SSE4_2__ # include #endif @@ -36,6 +30,7 @@ template ; /// map_size for ngram difference. static constexpr size_t map_size = 1u << 16; @@ -44,7 +39,7 @@ struct NgramDistanceImpl static constexpr size_t max_string_size = 1u << 15; /// Default padding to read safely. - static constexpr size_t default_padding = 16; + static constexpr size_t default_padding = StrOp::default_padding; /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding. static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1; @@ -70,102 +65,6 @@ struct NgramDistanceImpl #endif } - template - static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) - { - ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); - } - - static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end) - { - /// Offset before which we copy some data. 
- constexpr size_t padding_offset = default_padding - N + 1; - /// We have an array like this for ASCII (N == 4, other cases are similar) - /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| - /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start - /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction - memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); - /// Now we have an array - /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| - /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - /// Doing unaligned read of 16 bytes and copy them like above - /// 16 is also chosen to do two `movups`. - /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. - memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); - - if constexpr (case_insensitive) - { - /// We really need template lambdas with C++20 to do it inline - unrollLowering(code_points, std::make_index_sequence()); - } - pos += padding_offset; - if (pos > end) - return default_padding - (pos - end); - return default_padding; - } - - static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end) - { - /// The same copying as described in the function above. - memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); - - size_t num = N - 1; - while (num < default_padding && pos < end) - { - size_t length = UTF8::seqLength(*pos); - - if (pos + length > end) - length = end - pos; - - CodePoint res; - /// This is faster than just memcpy because of compiler optimizations with moving bytes. 
- switch (length) - { - case 1: - res = 0; - memcpy(&res, pos, 1); - break; - case 2: - res = 0; - memcpy(&res, pos, 2); - break; - case 3: - res = 0; - memcpy(&res, pos, 3); - break; - default: - memcpy(&res, pos, 4); - } - - /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte. - /// And first bit of first byte if there are two bytes. - /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does. - /// For others, we don't care now. Lowering UTF is not a cheap operation. - if constexpr (case_insensitive) - { - switch (length) - { - case 4: - res &= ~(1u << (5 + 3 * CHAR_BIT)); - [[fallthrough]]; - case 3: - res &= ~(1u << (5 + 2 * CHAR_BIT)); - [[fallthrough]]; - case 2: - res &= ~(1u); - res &= ~(1u << (5 + CHAR_BIT)); - [[fallthrough]]; - default: - res &= ~(1u << 5); - } - } - - pos += length; - code_points[num++] = res; - } - return num; - } - template static ALWAYS_INLINE inline size_t calculateNeedleStats( const char * data, @@ -250,9 +149,9 @@ struct NgramDistanceImpl static inline auto dispatchSearcher(Callback callback, Args &&... args) { if constexpr (!UTF8) - return callback(std::forward(args)..., readASCIICodePoints, ASCIIHash); + return callback(std::forward(args)..., StrOp::readASCIICodePoints, ASCIIHash); else - return callback(std::forward(args)..., readUTF8CodePoints, UTF8Hash); + return callback(std::forward(args)..., StrOp::readUTF8CodePoints, UTF8Hash); } static void constant_constant(std::string data, std::string needle, Float32 & res) @@ -269,7 +168,8 @@ struct NgramDistanceImpl size_t distance = second_size; if (data_size <= max_string_size) { - size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); + size_t first_size + = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); /// For !symmetric version we should not use first_size. 
if constexpr (symmetric) res = distance * 1.f / std::max(first_size + second_size, size_t(1)); @@ -313,23 +213,14 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { /// Get needle stats. - const size_t needle_stats_size = dispatchSearcher( - calculateNeedleStats, - needle, - needle_size, - common_stats, - needle_ngram_storage.get()); + const size_t needle_stats_size + = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); size_t distance = needle_stats_size; /// Combine with haystack stats, return to initial needle stats. const size_t haystack_stats_size = dispatchSearcher( - calculateHaystackStatsAndMetric, - haystack, - haystack_size, - common_stats, - distance, - haystack_ngram_storage.get()); + calculateHaystackStatsAndMetric, haystack, haystack_size, common_stats, distance, haystack_ngram_storage.get()); /// Return to zero array stats. for (size_t j = 0; j < needle_stats_size; ++j) @@ -391,12 +282,8 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { - const size_t needle_stats_size = dispatchSearcher( - calculateNeedleStats, - needle, - needle_size, - common_stats, - needle_ngram_storage.get()); + const size_t needle_stats_size + = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); size_t distance = needle_stats_size; @@ -420,15 +307,11 @@ struct NgramDistanceImpl prev_offset = needle_offsets[i]; } - } } static void vector_constant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - std::string needle, - PaddedPODArray & res) + const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) { /// zeroing our map NgramStats common_stats = {}; @@ -454,7 +337,8 @@ struct NgramDistanceImpl size_t haystack_stats_size = dispatchSearcher( calculateHaystackStatsAndMetric, 
reinterpret_cast(haystack), - haystack_size, common_stats, + haystack_size, + common_stats, distance, ngram_storage.get()); /// For !symmetric version we should not use haystack_stats_size. @@ -516,14 +400,18 @@ struct NameNgramSearchUTF8CaseInsensitive }; using FunctionNgramDistance = FunctionsStringSimilarity, NameNgramDistance>; -using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; +using FunctionNgramDistanceCaseInsensitive + = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8>; -using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; +using FunctionNgramDistanceCaseInsensitiveUTF8 + = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; using FunctionNgramSearch = FunctionsStringSimilarity, NameNgramSearch>; -using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; +using FunctionNgramSearchCaseInsensitive + = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; using FunctionNgramSearchUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8>; -using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; +using FunctionNgramSearchCaseInsensitiveUTF8 + = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; void registerFunctionsStringSimilarity(FunctionFactory & factory) From e0cf07e958c77cf3b7f1faeb727ba3541ae00f18 Mon Sep 17 00:00:00 2001 From: fenglv Date: Wed, 6 Nov 2019 18:35:23 +0800 Subject: [PATCH 02/23] add hammingdistance function --- dbms/src/Functions/bitHammingDistance.cpp | 174 ++++++++++++++ dbms/src/Functions/registerFunctions.cpp | 2 + .../Functions/registerFunctionsArithmetic.cpp | 4 + dbms/src/Functions/tupleHammingDistance.cpp | 224 ++++++++++++++++++ 4 files changed, 404 insertions(+) create mode 100644 
dbms/src/Functions/bitHammingDistance.cpp create mode 100644 dbms/src/Functions/tupleHammingDistance.cpp diff --git a/dbms/src/Functions/bitHammingDistance.cpp b/dbms/src/Functions/bitHammingDistance.cpp new file mode 100644 index 00000000000..2572720bb4e --- /dev/null +++ b/dbms/src/Functions/bitHammingDistance.cpp @@ -0,0 +1,174 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + + +template +struct BitHammingDistanceImpl +{ + using ResultType = UInt8; + + static void NO_INLINE vector_vector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a[i], b[i]); + } + + static void NO_INLINE vector_constant(const PaddedPODArray & a, B b, PaddedPODArray & c) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a[i], b); + } + + static void NO_INLINE constant_vector(A a, const PaddedPODArray & b, PaddedPODArray & c) + { + size_t size = b.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a, b[i]); + } + + static ResultType constant_constant(A a, B b) { return apply(a, b); } + +private: + static UInt8 pop_cnt(UInt64 res) + { + UInt8 count = 0; + for (; res; res >>= 1) + count += res & 1u; + return count; + } + + static inline UInt8 apply(UInt64 a, UInt64 b) + { + UInt64 res = a ^ b; + return pop_cnt(res); + } +}; + +template +bool castType(const IDataType * type, F && f) +{ + return castTypeToEither< + DataTypeInt8, + DataTypeInt16, + DataTypeInt32, + DataTypeInt64, + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); +} + +template +static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) +{ + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); +} + 
+//bitHammingDistance function: (Integer, Integer) -> UInt8 +class FunctionBitHammingDistance : public IFunction +{ +public: + static constexpr auto name = "bitHammingDistance"; + using ResultType = UInt8; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isInteger(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isInteger(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + auto * left_generic = block.getByPosition(arguments[0]).type.get(); + auto * right_generic = block.getByPosition(arguments[1]).type.get(); + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + using LeftDataType = std::decay_t; + using RightDataType = std::decay_t; + using T0 = typename LeftDataType::FieldType; + using T1 = typename RightDataType::FieldType; + using ColVecT0 = ColumnVector; + using ColVecT1 = ColumnVector; + using ColVecResult = ColumnVector; + + using OpImpl = BitHammingDistanceImpl; + + auto col_left_raw = block.getByPosition(arguments[0]).column.get(); + auto col_right_raw = block.getByPosition(arguments[1]).column.get(); + if (auto col_left = checkAndGetColumnConst(col_left_raw)) + { + if (auto col_right = checkAndGetColumnConst(col_right_raw)) + { + //constant integer - constant integer + auto res = OpImpl::constant_constant(col_left->template getValue(), col_right->template getValue()); + block.getByPosition(result).column = 
DataTypeUInt8().createColumnConst(col_left->size(), toField(res)); + return true; + } + } + + typename ColVecResult::MutablePtr col_res = nullptr; + col_res = ColVecResult::create(); + + auto & vec_res = col_res->getData(); + vec_res.resize(block.rows()); + + if (auto col_left_const = checkAndGetColumnConst(col_left_raw)) + { + if (auto col_right = checkAndGetColumn(col_right_raw)) + { + // constant integer - non-constant integer + OpImpl::constant_vector(col_left_const->template getValue(), col_right->getData(), vec_res); + } + else + return false; + } + else if (auto col_left = checkAndGetColumn(col_left_raw)) + { + if (auto col_right = checkAndGetColumn(col_right_raw)) + //non-constant integer - non-constant integer + OpImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); + else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) + //non-constant integer - constant integer + OpImpl::vector_constant(col_left->getData(), col_right_const->template getValue(), vec_res); + else + return false; + } + else + return false; + + block.getByPosition(result).column = std::move(col_res); + return true; + }); + if (!valid) + throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + } +}; + +void registerFunctionBitHammingDistance(FunctionFactory & factory) +{ + factory.registerFunction(); +} +} diff --git a/dbms/src/Functions/registerFunctions.cpp b/dbms/src/Functions/registerFunctions.cpp index 501f8e7f90a..09000a1dadd 100644 --- a/dbms/src/Functions/registerFunctions.cpp +++ b/dbms/src/Functions/registerFunctions.cpp @@ -41,6 +41,7 @@ void registerFunctionsFindCluster(FunctionFactory &); void registerFunctionsJSON(FunctionFactory &); void registerFunctionsIntrospection(FunctionFactory &); void registerFunctionsConsistentHashing(FunctionFactory & factory); +void registerFunctionsStringHash(FunctionFactory & factory); void registerFunctions() { @@ -80,6 +81,7 @@ void registerFunctions() 
registerFunctionsJSON(factory); registerFunctionsIntrospection(factory); registerFunctionsConsistentHashing(factory); + registerFunctionsStringHash(factory); } } diff --git a/dbms/src/Functions/registerFunctionsArithmetic.cpp b/dbms/src/Functions/registerFunctionsArithmetic.cpp index 1faa28e395e..a03058c37e9 100644 --- a/dbms/src/Functions/registerFunctionsArithmetic.cpp +++ b/dbms/src/Functions/registerFunctionsArithmetic.cpp @@ -32,6 +32,8 @@ void registerFunctionIntExp10(FunctionFactory & factory); void registerFunctionRoundToExp2(FunctionFactory & factory); void registerFunctionRoundDuration(FunctionFactory & factory); void registerFunctionRoundAge(FunctionFactory & factory); +void registerFunctionBitHammingDistance(FunctionFactory & factory); +void registerFunctionTupleHammingDistance(FunctionFactory & factory); void registerFunctionBitBoolMaskOr(FunctionFactory & factory); void registerFunctionBitBoolMaskAnd(FunctionFactory & factory); @@ -69,6 +71,8 @@ void registerFunctionsArithmetic(FunctionFactory & factory) registerFunctionRoundToExp2(factory); registerFunctionRoundDuration(factory); registerFunctionRoundAge(factory); + registerFunctionBitHammingDistance(factory); + registerFunctionTupleHammingDistance(factory); /// Not for external use. 
registerFunctionBitBoolMaskOr(factory); diff --git a/dbms/src/Functions/tupleHammingDistance.cpp b/dbms/src/Functions/tupleHammingDistance.cpp new file mode 100644 index 00000000000..4a727aef59a --- /dev/null +++ b/dbms/src/Functions/tupleHammingDistance.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +template +struct TupleHammingDistanceImpl +{ + using ResultType = UInt8; + + static void NO_INLINE vector_vector( + const PaddedPODArray & a1, + const PaddedPODArray & b1, + const PaddedPODArray & a2, + const PaddedPODArray & b2, + PaddedPODArray & c) + { + size_t size = a1.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1[i], a2[i]) + apply(b1[i], b2[i]); + } + + static void NO_INLINE + vector_constant(const PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) + { + size_t size = a1.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1[i], a2) + apply(b1[i], b2); + } + + static void NO_INLINE + constant_vector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) + { + size_t size = a2.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1, a2[i]) + apply(b1, b2[i]); + } + + static ResultType constant_constant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } + +private: + static UInt8 pop_cnt(UInt64 res) + { + UInt8 count = 0; + for (; res; res >>= 1) + count += res & 1u; + return count; + } + + static inline UInt8 apply(UInt64 a, UInt64 b) + { + UInt64 res = a ^ b; + return pop_cnt(res); + } +}; + +template +bool castType(const IDataType * type, F && f) +{ + return castTypeToEither< + DataTypeInt8, + DataTypeInt16, + DataTypeInt32, + DataTypeInt64, + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); +} 
+ +template +static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) +{ + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); +} + +//tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 +//in order to avoid code bloating, for non-constant tuple, we make sure that the elements +//in the tuple should have same data type, and for constant tuple, elements can be any integer +//data type, we cast all of them into UInt64 +class FunctionTupleHammingDistance : public IFunction +{ +public: + static constexpr auto name = "tupleHammingDistance"; + using ResultType = UInt8; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isTuple(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isTuple(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + const ColumnWithTypeAndName & arg1 = block.getByPosition(arguments[0]); + const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); + const DataTypeTuple & type1 = static_cast(*arg1.type); + const DataTypeTuple & type2 = static_cast(*arg2.type); + auto & left_elems = type1.getElements(); + auto & right_elems = type2.getElements(); + if (left_elems.size() != 2 || right_elems.size() != 2) + throw Exception( + "Illegal column of arguments of function " + getName() + ", tuple should have exactly two 
elements.", + ErrorCodes::ILLEGAL_COLUMN); + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { + using LeftDataType = std::decay_t; + using RightDataType = std::decay_t; + using T0 = typename LeftDataType::FieldType; + using T1 = typename RightDataType::FieldType; + using ColVecT0 = ColumnVector; + using ColVecT1 = ColumnVector; + using ColVecResult = ColumnVector; + + using OpImpl = TupleHammingDistanceImpl; + + // constant tuple - constant tuple + if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) + { + if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) + { + auto cols1 = convertConstTupleToConstantElements(*const_col_left); + auto cols2 = convertConstTupleToConstantElements(*const_col_right); + Field a1, b1, a2, b2; + cols1[0]->get(0, a1); + cols1[1]->get(0, b1); + cols2[0]->get(0, a2); + cols2[1]->get(0, b2); + auto res = OpImpl::constant_constant(a1.get(), b1.get(), a2.get(), b2.get()); + block.getByPosition(result).column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); + return true; + } + } + + typename ColVecResult::MutablePtr col_res = nullptr; + col_res = ColVecResult::create(); + auto & vec_res = col_res->getData(); + vec_res.resize(block.rows()); + // constant tuple - non-constant tuple + if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) + { + if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) + { + auto const_cols = convertConstTupleToConstantElements(*const_col_left); + Field a1, b1; + const_cols[0]->get(0, a1); + const_cols[1]->get(0, b1); + auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); + auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); + if (col_r1 && col_r2) + OpImpl::constant_vector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); + else + return false; + } + else + return false; + } + else if (const 
ColumnTuple * col_left = typeid_cast(arg1.column.get()))
+            {
+                auto col_l1 = checkAndGetColumn(&col_left->getColumn(0));
+                auto col_l2 = checkAndGetColumn(&col_left->getColumn(1));
+                if (col_l1 && col_l2)
+                {
+                    // non-constant tuple - constant tuple
+                    if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get()))
+                    {
+                        auto const_cols = convertConstTupleToConstantElements(*const_col_right);
+                        Field a2, b2;
+                        const_cols[0]->get(0, a2);
+                        const_cols[1]->get(0, b2);
+                        OpImpl::vector_constant(col_l1->getData(), col_l2->getData(), a2.get(), b2.get(), vec_res);
+                    }
+                    // non-constant tuple - non-constant tuple
+                    else if (const ColumnTuple * col_right = typeid_cast(arg2.column.get()))
+                    {
+                        auto col_r1 = checkAndGetColumn(&col_right->getColumn(0));
+                        auto col_r2 = checkAndGetColumn(&col_right->getColumn(1));
+                        if (col_r1 && col_r2)
+                            OpImpl::vector_vector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res);
+                        else
+                            return false;
+                    }
+                    else
+                        return false;
+                }
+                else
+                    return false;
+            }
+            else
+                return false;
+            block.getByPosition(result).column = std::move(col_res);
+            return true;
+        });
+        if (!valid)
+            throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN);
+    }
+};
+
+void registerFunctionTupleHammingDistance(FunctionFactory & factory)
+{
+    factory.registerFunction();
+}
+}
From 9403dd1520dfd7a887a159ab0af0da699747e2ec Mon Sep 17 00:00:00 2001
From: fenglv
Date: Wed, 6 Nov 2019 18:35:55 +0800
Subject: [PATCH 03/23] add test

fix comment style

fix lambda function style

---
 dbms/src/Functions/ExtractString.h            | 26 ++++-----
 dbms/src/Functions/FunctionsStringHash.cpp    | 54 +++++++++----------
 dbms/src/Functions/FunctionsStringHash.h      | 10 ++--
 dbms/src/Functions/bitHammingDistance.cpp     | 15 +++---
 dbms/src/Functions/tupleHammingDistance.cpp   | 15 +++---
 .../01016_simhash_minhash.reference           | 50 +++++++++++++++++
 .../0_stateless/01016_simhash_minhash.sql     | 47 ++++++++++++++++
.../01017_bithamming_distance.reference | 15 ++++++ .../0_stateless/01017_bithamming_distance.sql | 20 +++++++ .../01017_tuplehamming_distance.reference | 15 ++++++ .../01017_tuplehamming_distance.sql | 19 +++++++ 11 files changed, 228 insertions(+), 58 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01016_simhash_minhash.reference create mode 100644 dbms/tests/queries/0_stateless/01016_simhash_minhash.sql create mode 100644 dbms/tests/queries/0_stateless/01017_bithamming_distance.reference create mode 100644 dbms/tests/queries/0_stateless/01017_bithamming_distance.sql create mode 100644 dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference create mode 100644 dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql diff --git a/dbms/src/Functions/ExtractString.h b/dbms/src/Functions/ExtractString.h index 05566496cba..c74b5175ea6 100644 --- a/dbms/src/Functions/ExtractString.h +++ b/dbms/src/Functions/ExtractString.h @@ -12,8 +12,8 @@ namespace DB { -//used by FunctionsStringSimilarity and FunctionsStringHash -//includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +// used by FunctionsStringSimilarity and FunctionsStringHash +// includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word template struct ExtractStringImpl { @@ -47,14 +47,14 @@ struct ExtractStringImpl return default_padding; } - //used by FunctionsStringHash - //it's not easy to add padding for ColumnString, so we need safety check each memcpy + // used by FunctionsStringHash + // it's not easy to add padding for ColumnString, so we need safety check each memcpy static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end) { constexpr size_t padding_offset = default_padding - N + 1; memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); - //safety check + // safety check size_t cpy_size = (pos + padding_offset > end) ? 
end - pos : padding_offset; memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8)); @@ -69,12 +69,12 @@ struct ExtractStringImpl return default_padding; } - //read a ASCII word from pos to word - //if the word size exceeds max_word_size, only read max_word_size byte - //in FuntionsStringHash, the default value of max_word_size is 128 + // read a ASCII word from pos to word + // if the word size exceeds max_word_size, only read max_word_size byte + // in FuntionsStringHash, the default value of max_word_size is 128 static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size) { - //jump seperators + // jump seperators while (pos < end && !isAlphaNum(*pos)) ++pos; @@ -105,14 +105,14 @@ struct ExtractStringImpl return num; } - //read one UTF8 word from pos to word - //also, we assume that one word size cann't exceed max_word_size with default value 128 + // read one UTF8 word from pos to word + // also, we assume that one word size cann't exceed max_word_size with default value 128 static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size) { // jump UTF8 seperator while (pos < end && isUTF8Sep(*pos)) ++pos; - //UTF8 word's character number + // UTF8 word's character number size_t num = 0; while (pos < end && num < max_word_size && !isUTF8Sep(*pos)) { @@ -133,7 +133,7 @@ private: ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); } - //we use ASCII non-alphanum character as UTF8 seperator + // we use ASCII non-alphanum character as UTF8 seperator static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); } // read one UTF8 character and return it diff --git a/dbms/src/Functions/FunctionsStringHash.cpp b/dbms/src/Functions/FunctionsStringHash.cpp index 797d7d30078..215d49544cb 100644 --- a/dbms/src/Functions/FunctionsStringHash.cpp +++ 
b/dbms/src/Functions/FunctionsStringHash.cpp @@ -75,7 +75,7 @@ struct Hash } }; -//Sinhash String -> UInt64 +// Sinhash String -> UInt64 template struct SimhashImpl { @@ -123,8 +123,8 @@ struct SimhashImpl iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); - //finally, we return a 64 bit value according to finger_vec - //if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 + // finally, we return a 64 bit value according to finger_vec + // if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 std::bitset<64> res_bit(0u); for (size_t i = 0; i < 64; ++i) { @@ -160,7 +160,7 @@ struct SimhashImpl // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; - //get first word shingle + // get first word shingle for (size_t i = 0; i < N && start < end; ++i) { word_size = read_one_word(word_buf, start, end, max_word_size); @@ -189,9 +189,9 @@ struct SimhashImpl // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| nwordHashes[offset] = Hash::hashSum(word_buf, word_size); offset = (offset + 1) % N; - //according to the word hash storation way, in order to not lose the word shingle's - //sequence information, when calculation word shingle hash value, we need provide the offset - //inforation, which is the offset of the first word's hash value of the word shingle + // according to the word hash storation way, in order to not lose the word shingle's + // sequence information, when calculation word shingle hash value, we need provide the offset + // inforation, which is the offset of the first word's hash value of the word shingle hash_value = hash_functor(nwordHashes, N, offset); std::bitset<64> bits(hash_value); for (size_t i = 0; i < 64; ++i) @@ -237,7 +237,7 @@ struct SimhashImpl res = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); } - //non-constant string + // non-constant string static void vector(const ColumnString::Chars & data, const 
ColumnString::Offsets & offsets, PaddedPODArray & res) { for (size_t i = 0; i < offsets.size(); ++i) @@ -255,12 +255,12 @@ struct SimhashImpl } }; -//Minhash: String -> Tuple(UInt64, UInt64) -//for each string, we extract ngram or word shingle, -//for each ngram or word shingle, calculate a hash value, -//then we take the K minimum hash values to calculate a hashsum, -//and take the K maximum hash values to calculate another hashsum, -//return this two hashsum: Tuple(hashsum1, hashsum2) +// Minhash: String -> Tuple(UInt64, UInt64) +// for each string, we extract ngram or word shingle, +// for each ngram or word shingle, calculate a hash value, +// then we take the K minimum hash values to calculate a hashsum, +// and take the K maximum hash values to calculate another hashsum, +// return this two hashsum: Tuple(hashsum1, hashsum2) template struct MinhashImpl { @@ -298,11 +298,11 @@ struct MinhashImpl hashes[i] = v; } - //Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) - //we extract ngram from input string, and calculate a hash value for each ngram - //then we take the K minimum hash values to calculate a hashsum, - //and take the K maximum hash values to calculate another hashsum, - //return this two hashsum: Tuple(hashsum1, hashsum2) + // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) + // we extract ngram from input string, and calculate a hash value for each ngram + // then we take the K minimum hash values to calculate a hashsum, + // and take the K maximum hash values to calculate another hashsum, + // return this two hashsum: Tuple(hashsum1, hashsum2) static ALWAYS_INLINE inline std::tuple ngramCalculateHashValue( const char * data, const size_t size, @@ -339,8 +339,8 @@ struct MinhashImpl } // Minhash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) - //for each word shingle, we calculate a hash value, but in fact, we just maintain the - //K minimum and K maximum hash value + // for each word 
shingle, we calculate a hash value, but in fact, we just maintain the + // K minimum and K maximum hash value static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( const char * data, const size_t size, @@ -349,7 +349,7 @@ struct MinhashImpl { const char * start = data; const char * end = start + size; - //also we just store the K minimu and K maximum hash values + // also we just store the K minimu and K maximum hash values UInt64 k_minimum[K] = {}; UInt64 k_maxinum[K] = {}; // array to store n word hashes @@ -357,8 +357,8 @@ struct MinhashImpl // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; - //how word shingle hash value calculation and word hash storation is same as we - //have descripted in Simhash wordShinglesCalculateHashValue function + // how word shingle hash value calculation and word hash storation is same as we + // have descripted in Simhash wordShinglesCalculateHashValue function for (size_t i = 0; i < N && start < end; ++i) { word_size = read_one_word(word_buf, start, end, max_word_size); @@ -416,7 +416,7 @@ struct MinhashImpl std::tie(res1, res2) = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); } - //non-constant string + // non-constant string static void vector( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, @@ -518,7 +518,7 @@ struct NameWordShingleMinhashCaseInsensitiveUTF8 static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8"; }; -//Simhash +// Simhash using FunctionNgramSimhash = FunctionsStringHash, NameNgramSimhash, true>; using FunctionNgramSimhashCaseInsensitive @@ -539,7 +539,7 @@ using FunctionWordShingleSimhashUTF8 = FunctionsStringHash, NameWordShingleSimhashCaseInsensitiveUTF8, true>; -//Minhash +// Minhash using FunctionNgramMinhash = FunctionsStringHash, NameNgramMinhash, false>; using FunctionNgramMinhashCaseInsensitive diff --git a/dbms/src/Functions/FunctionsStringHash.h 
b/dbms/src/Functions/FunctionsStringHash.h index 185097ade99..bb1e42ab5fa 100644 --- a/dbms/src/Functions/FunctionsStringHash.h +++ b/dbms/src/Functions/FunctionsStringHash.h @@ -15,14 +15,12 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int TOO_LARGE_STRING_SIZE; } -//FunctionStringHash -//Simhash: String -> UInt64 -//Minhash: String -> (UInt64, UInt64) +// FunctionStringHash +// Simhash: String -> UInt64 +// Minhash: String -> (UInt64, UInt64) template class FunctionsStringHash : public IFunction { @@ -103,7 +101,7 @@ public: } else { - //non const string + // non const string auto col_h1 = ColumnVector::create(); auto col_h2 = ColumnVector::create(); typename ColumnVector::Container & vec_h1 = col_h1->getData(); diff --git a/dbms/src/Functions/bitHammingDistance.cpp b/dbms/src/Functions/bitHammingDistance.cpp index 2572720bb4e..fdef72d4c43 100644 --- a/dbms/src/Functions/bitHammingDistance.cpp +++ b/dbms/src/Functions/bitHammingDistance.cpp @@ -75,10 +75,12 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); + return castType(left, [&](const auto & left_) { + return castType(right, [&](const auto & right_) { return f(left_, right_); }); + }); } -//bitHammingDistance function: (Integer, Integer) -> UInt8 +// bitHammingDistance function: (Integer, Integer) -> UInt8 class FunctionBitHammingDistance : public IFunction { public: @@ -105,7 +107,8 @@ public: { auto * left_generic = block.getByPosition(arguments[0]).type.get(); auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + bool valid = 
castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -122,7 +125,7 @@ public: { if (auto col_right = checkAndGetColumnConst(col_right_raw)) { - //constant integer - constant integer + // constant integer - constant integer auto res = OpImpl::constant_constant(col_left->template getValue(), col_right->template getValue()); block.getByPosition(result).column = DataTypeUInt8().createColumnConst(col_left->size(), toField(res)); return true; @@ -148,10 +151,10 @@ public: else if (auto col_left = checkAndGetColumn(col_left_raw)) { if (auto col_right = checkAndGetColumn(col_right_raw)) - //non-constant integer - non-constant integer + // non-constant integer - non-constant integer OpImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) - //non-constant integer - constant integer + // non-constant integer - constant integer OpImpl::vector_constant(col_left->getData(), col_right_const->template getValue(), vec_res); else return false; diff --git a/dbms/src/Functions/tupleHammingDistance.cpp b/dbms/src/Functions/tupleHammingDistance.cpp index 4a727aef59a..45c113edad4 100644 --- a/dbms/src/Functions/tupleHammingDistance.cpp +++ b/dbms/src/Functions/tupleHammingDistance.cpp @@ -83,13 +83,15 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); + return castType(left, [&](const auto & left_) { + return castType(right, [&](const auto & right_) { return f(left_, right_); }); + }); } -//tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 -//in order to avoid code bloating, for 
non-constant tuple, we make sure that the elements -//in the tuple should have same data type, and for constant tuple, elements can be any integer -//data type, we cast all of them into UInt64 +// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 +// in order to avoid code bloating, for non-constant tuple, we make sure that the elements +// in the tuple should have same data type, and for constant tuple, elements can be any integer +// data type, we cast all of them into UInt64 class FunctionTupleHammingDistance : public IFunction { public: @@ -124,7 +126,8 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); - bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; diff --git a/dbms/tests/queries/0_stateless/01016_simhash_minhash.reference b/dbms/tests/queries/0_stateless/01016_simhash_minhash.reference new file mode 100644 index 00000000000..fa62adde45c --- /dev/null +++ b/dbms/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -0,0 +1,50 @@ +0 +2718169299 +2718169299 +3333471646 +26585365 +4151513063 +4151513063 +4151513063 +3150464485 +(0,0) +(2736268688,2736268688) +(2736268688,2736268688) +(916562399,916562399) +(3436376151,3436376151) +(0,3423682776) +(0,3423682776) +(0,3423682776) +(0,2393737641) +2548869326 +2548869326 +401385678 +401385710 +4258739090 +4260836242 +718415633 +718681881 +4026448893 +4026449917 +4026466301 +4026466301 +4026448893 +4026449917 +3957325823 +4217372671 +(3946088007,3946088007) +(3946088007,3946088007) +(2332295796,2332295796) +(535012010,535012010) +(3696559901,3696559901) 
+(3696559901,3696559901) +(169287209,169287209) +(169287209,169287209) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1975937193) +(0,1975937193) diff --git a/dbms/tests/queries/0_stateless/01016_simhash_minhash.sql b/dbms/tests/queries/0_stateless/01016_simhash_minhash.sql new file mode 100644 index 00000000000..9e87216d26f --- /dev/null +++ b/dbms/tests/queries/0_stateless/01016_simhash_minhash.sql @@ -0,0 +1,47 @@ +SELECT ngramSimhash(''); +SELECT ngramSimhash('what a cute cat.'); +SELECT ngramSimhashCaseInsensitive('what a cute cat.'); +SELECT ngramSimhashUTF8('what a cute cat.'); +SELECT ngramSimhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleSimhash('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleSimhashUTF8('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitiveUTF8('what a cute cat.'); + +SELECT ngramMinhash(''); +SELECT ngramMinhash('what a cute cat.'); +SELECT ngramMinhashCaseInsensitive('what a cute cat.'); +SELECT ngramMinhashUTF8('what a cute cat.'); +SELECT ngramMinhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleMinhash('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleMinhashUTF8('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitiveUTF8('what a cute cat.'); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + s String +)ENGINE = Memory(); + +INSERT INTO defaults values ('It is the latest occurrence of the Southeast European haze, the issue that occurs in constant intensity during every wet season. 
It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.') ('It is the latest occurrence of the Southeast Asian haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.'); + +SELECT ngramSimhash(s) FROM defaults; +SELECT ngramSimhashCaseInsensitive(s) FROM defaults; +SELECT ngramSimhashUTF8(s) FROM defaults; +SELECT ngramSimhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleSimhash(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleSimhashUTF8(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitiveUTF8(s) FROM defaults; + +SELECT ngramMinhash(s) FROM defaults; +SELECT ngramMinhashCaseInsensitive(s) FROM defaults; +SELECT ngramMinhashUTF8(s) FROM defaults; +SELECT ngramMinhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleMinhash(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleMinhashUTF8(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitiveUTF8(s) FROM defaults; + +DROP TABLE defaults; diff --git a/dbms/tests/queries/0_stateless/01017_bithamming_distance.reference b/dbms/tests/queries/0_stateless/01017_bithamming_distance.reference new file mode 100644 index 00000000000..cc2d4f39154 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01017_bithamming_distance.reference @@ -0,0 +1,15 @@ +1 +7 +63 +2 +1 +3 +5 +4 +6 +6 +6 +3 +5 +9 +9 diff --git a/dbms/tests/queries/0_stateless/01017_bithamming_distance.sql b/dbms/tests/queries/0_stateless/01017_bithamming_distance.sql new file mode 100644 index 00000000000..4b36894b97c --- /dev/null +++ 
b/dbms/tests/queries/0_stateless/01017_bithamming_distance.sql @@ -0,0 +1,20 @@ +SELECT bitHammingDistance(1, 5); +SELECT bitHammingDistance(100, 100000); +SELECT bitHammingDistance(-1, 1); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + n1 UInt8, + n2 UInt16, + n3 UInt32, + n4 UInt64 +)ENGINE = Memory(); + +INSERT INTO defaults VALUES (1, 2, 3, 4) (12, 4345, 435, 1233) (45, 675, 32343, 54566) (90, 784, 9034, 778752); + +SELECT bitHammingDistance(4, n1) FROM defaults; +SELECT bitHammingDistance(n2, 100) FROM defaults; +SELECT bitHammingDistance(n3, n4) FROM defaults; + +DROP TABLE defaults; diff --git a/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference new file mode 100644 index 00000000000..eee1a7eee3b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference @@ -0,0 +1,15 @@ +3 +5 +60 +5 +3 +10 +10 +114 +119 +111 +104 +69 +13 +65 +25 diff --git a/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql new file mode 100644 index 00000000000..0db73232bb3 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql @@ -0,0 +1,19 @@ +SELECT tupleHammingDistance((1, 2), (3, 4)); +SELECT tupleHammingDistance((120, 2434), (123, 434)); +SELECT tupleHammingDistance((-12, 434), (987, 432)); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + t1 Tuple(UInt16, UInt16), + t2 Tuple(UInt32, UInt32), + t3 Tuple(Int64, Int64) +)ENGINE = Memory(); + +INSERT INTO defaults VALUES ((12, 43), (12312, 43453) ,(-10, 32)) ((1, 4), (546, 12345), (123, 456)) ((90, 9875), (43456, 234203), (1231, -123)) ((87, 987), (545645, 768354634), (9123, 909)); + +SELECT tupleHammingDistance((1, 3), t1) FROM defaults; +SELECT tupleHammingDistance(t2, (-1, 1)) FROM defaults; +SELECT tupleHammingDistance(t2, t3) FROM defaults; + +DROP TABLE defaults; From 
ced7fe59dbe48f261abb3fec427eadbc50ba7c5f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 5 Dec 2019 06:48:40 +0300 Subject: [PATCH 04/23] Update ExtractString.h --- dbms/src/Functions/ExtractString.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/ExtractString.h b/dbms/src/Functions/ExtractString.h index c74b5175ea6..040e62d9580 100644 --- a/dbms/src/Functions/ExtractString.h +++ b/dbms/src/Functions/ExtractString.h @@ -13,7 +13,7 @@ namespace DB { // used by FunctionsStringSimilarity and FunctionsStringHash -// includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word template struct ExtractStringImpl { From 241fd556576fc7833174c5346568732e1742a8d8 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 5 Dec 2019 07:08:35 +0300 Subject: [PATCH 05/23] Update FunctionsStringHash.cpp --- dbms/src/Functions/FunctionsStringHash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsStringHash.cpp b/dbms/src/Functions/FunctionsStringHash.cpp index 215d49544cb..d7277fcb98b 100644 --- a/dbms/src/Functions/FunctionsStringHash.cpp +++ b/dbms/src/Functions/FunctionsStringHash.cpp @@ -75,7 +75,7 @@ struct Hash } }; -// Sinhash String -> UInt64 +// Simhash String -> UInt64 template struct SimhashImpl { From 83c0807b43d7ca5587b16c43a577fec6ee51ec75 Mon Sep 17 00:00:00 2001 From: feng lv Date: Fri, 22 May 2020 21:23:49 +0800 Subject: [PATCH 06/23] update update name --- src/Functions/ExtractString.h | 14 +-- src/Functions/FunctionsStringHash.cpp | 106 ++++++++---------- src/Functions/FunctionsStringHash.h | 8 +- src/Functions/bitHammingDistance.cpp | 31 +---- src/Functions/registerFunctions.cpp | 7 +- src/Functions/tupleHammingDistance.cpp | 27 ++--- .../01016_simhash_minhash.reference | 59 ++++++++++ .../0_stateless/01016_simhash_minhash.sql | 47 ++++++++ .../01017_bithamming_distance.reference | 15 
+++ .../0_stateless/01017_bithamming_distance.sql | 20 ++++ .../01017_tuplehamming_distance.reference | 15 +++ .../01017_tuplehamming_distance.sql | 19 ++++ 12 files changed, 251 insertions(+), 117 deletions(-) create mode 100644 tests/queries/0_stateless/01016_simhash_minhash.reference create mode 100644 tests/queries/0_stateless/01016_simhash_minhash.sql create mode 100644 tests/queries/0_stateless/01017_bithamming_distance.reference create mode 100644 tests/queries/0_stateless/01017_bithamming_distance.sql create mode 100644 tests/queries/0_stateless/01017_tuplehamming_distance.reference create mode 100644 tests/queries/0_stateless/01017_tuplehamming_distance.sql diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index 040e62d9580..f6a7394a9fc 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -72,18 +72,18 @@ struct ExtractStringImpl // read a ASCII word from pos to word // if the word size exceeds max_word_size, only read max_word_size byte // in FuntionsStringHash, the default value of max_word_size is 128 - static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size) + static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, size_t max_word_size) { // jump seperators - while (pos < end && !isAlphaNum(*pos)) + while (pos < end && !isAlphaNumericASCII(*pos)) ++pos; // word start from here const char * word_start = pos; - while (pos < end && isAlphaNum(*pos)) + while (pos < end && isAlphaNumericASCII(*pos)) ++pos; - size_t word_size = (static_cast(pos - word_start) <= max_word_size) ? 
pos - word_start : max_word_size; + size_t word_size = std::min(pos - word_start, max_word_size); memcpy(word, word_start, word_size); if (CaseInsensitive) @@ -107,7 +107,7 @@ struct ExtractStringImpl // read one UTF8 word from pos to word // also, we assume that one word size cann't exceed max_word_size with default value 128 - static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size) + static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, size_t max_word_size) { // jump UTF8 seperator while (pos < end && isUTF8Sep(*pos)) @@ -122,7 +122,7 @@ struct ExtractStringImpl } private: - static ALWAYS_INLINE inline bool isAlphaNum(const UInt8 c) + static ALWAYS_INLINE inline bool isAlphaNumericASCII(const UInt8 c) { return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122); } @@ -134,7 +134,7 @@ private: } // we use ASCII non-alphanum character as UTF8 seperator - static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); } + static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } // read one UTF8 character and return it static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index d7277fcb98b..2195ff7c703 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -31,47 +31,35 @@ struct Hash #endif } - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, const size_t & size, const size_t & offset) + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) { - UInt64 res = 0; - UInt8 flag = 0; + UInt64 crc = -1ULL; +#ifdef __SSE4_2__ for (size_t i = offset; i < size; ++i) - { - if (flag) - res &= intHashCRC32(hashes[i]); - 
else - res |= intHashCRC32(hashes[i]); - flag = (flag + 1) % 2; - } + crc = _mm_crc32_u64(crc, hashes[i]); for (size_t i = 0; i < offset; ++i) - { - if (flag) - res &= intHashCRC32(hashes[i]); - else - res |= intHashCRC32(hashes[i]); - flag = (flag + 1) % 2; - } - return res; + crc = _mm_crc32_u64(crc, hashes[i]); +#else + for (size_t i = offset; i < size; ++i) + crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); + for (size_t i = 0; i < offset; ++i) + crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); +#endif + return crc; } template - static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, const size_t & K) + static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, size_t K) { - UInt64 even = 0; - UInt64 odd = 0; - size_t i = 0; - for (; i + 1 < K; i += 2) - { - even |= intHashCRC32(hashes[i]); - odd |= intHashCRC32(hashes[i + 1]); - } - if (i < K) - even |= intHashCRC32(hashes[K - 1]); + UInt64 crc = -1ULL; #ifdef __SSE4_2__ - return _mm_crc32_u64(even, odd); + for (size_t i = 0; i < K; ++i) + crc = _mm_crc32_u64(crc, hashes[i]); #else - return (intHashCRC32(even) ^ intHashCRC32(odd)); + for (size_t i = 0; i < K; ++i) + crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); #endif + return crc; } }; @@ -93,7 +81,7 @@ struct SimhashImpl // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue( const char * data, - const size_t size, + size_t size, size_t (*read_code_points)(CodePoint *, const char *&, const char *), UInt64 (*hash_functor)(const CodePoint *)) { @@ -146,9 +134,9 @@ struct SimhashImpl // values to caculate the next word shingle hash value static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue( const char * data, - const size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &), - UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &)) + size_t size, + size_t 
(*read_one_word)(CodePoint *, const char *&, const char *, size_t), + UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; const char * end = data + size; @@ -156,7 +144,7 @@ struct SimhashImpl // Also, a 64 bit vector initialized to zero Int64 finger_vec[64] = {}; // a array to store N word hash values - UInt64 nwordHashes[N] = {}; + UInt64 nword_hashes[N] = {}; // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; @@ -167,16 +155,16 @@ struct SimhashImpl if (word_size) { // for each word, calculate a hash value and stored into the array - nwordHashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf, word_size); } } // calculate the first word shingle hash value - UInt64 hash_value = hash_functor(nwordHashes, N, 0); - std::bitset<64> bits_(hash_value); + UInt64 hash_value = hash_functor(nword_hashes, N, 0); + std::bitset<64> first_bits(hash_value); for (size_t i = 0; i < 64; ++i) { - finger_vec[i] += ((bits_.test(i)) ? 1 : -1); + finger_vec[i] += ((first_bits.test(i)) ? 
1 : -1); } size_t offset = 0; @@ -187,12 +175,12 @@ struct SimhashImpl // so we need to store new word hash into location of a0, then ,this array become // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| - nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf, word_size); offset = (offset + 1) % N; // according to the word hash storation way, in order to not lose the word shingle's // sequence information, when calculation word shingle hash value, we need provide the offset // inforation, which is the offset of the first word's hash value of the word shingle - hash_value = hash_functor(nwordHashes, N, offset); + hash_value = hash_functor(nword_hashes, N, offset); std::bitset<64> bits(hash_value); for (size_t i = 0; i < 64; ++i) { @@ -272,7 +260,7 @@ struct MinhashImpl // insert a new value into K minimum hash array if this value // is smaller than the greatest value in the array - static ALWAYS_INLINE inline void insert_minValue(UInt64 * hashes, UInt64 v) + static ALWAYS_INLINE inline void insertMinValue(UInt64 * hashes, UInt64 v) { size_t i = 0; for (; i < K && hashes[i] <= v; ++i) @@ -286,7 +274,7 @@ struct MinhashImpl // insert a new value into K maximum hash array if this value // is greater than the smallest value in the array - static ALWAYS_INLINE inline void insert_maxValue(UInt64 * hashes, UInt64 v) + static ALWAYS_INLINE inline void insertMaxValue(UInt64 * hashes, UInt64 v) { int i = K - 1; for (; i >= 0 && hashes[i] >= v; --i) @@ -305,7 +293,7 @@ struct MinhashImpl // return this two hashsum: Tuple(hashsum1, hashsum2) static ALWAYS_INLINE inline std::tuple ngramCalculateHashValue( const char * data, - const size_t size, + size_t size, size_t (*read_code_points)(CodePoint *, const char *&, const char *), UInt64 (*hash_functor)(const CodePoint *)) { @@ -326,8 +314,8 @@ struct MinhashImpl auto 
new_hash = hash_functor(cp + iter); // insert the new hash value into array used to store K minimum value // and K maximum value - insert_minValue(k_minimum, new_hash); - insert_maxValue(k_maxinum, new_hash); + insertMinValue(k_minimum, new_hash); + insertMaxValue(k_maxinum, new_hash); } iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); @@ -343,9 +331,9 @@ struct MinhashImpl // K minimum and K maximum hash value static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( const char * data, - const size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &), - UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &)) + size_t size, + size_t (*read_one_word)(CodePoint *, const char *&, const char *, size_t), + UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; const char * end = start + size; @@ -353,7 +341,7 @@ struct MinhashImpl UInt64 k_minimum[K] = {}; UInt64 k_maxinum[K] = {}; // array to store n word hashes - UInt64 nwordHashes[N] = {}; + UInt64 nword_hashes[N] = {}; // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; @@ -364,22 +352,22 @@ struct MinhashImpl word_size = read_one_word(word_buf, start, end, max_word_size); if (word_size) { - nwordHashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf, word_size); } } - auto new_hash = hash_functor(nwordHashes, N, 0); - insert_minValue(k_minimum, new_hash); - insert_maxValue(k_maxinum, new_hash); + auto new_hash = hash_functor(nword_hashes, N, 0); + insertMinValue(k_minimum, new_hash); + insertMaxValue(k_maxinum, new_hash); size_t offset = 0; while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) { - nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf, word_size); offset = (offset + 1) % N; - new_hash = hash_functor(nwordHashes, 
N, offset); - insert_minValue(k_minimum, new_hash); - insert_maxValue(k_maxinum, new_hash); + new_hash = hash_functor(nword_hashes, N, offset); + insertMinValue(k_minimum, new_hash); + insertMaxValue(k_maxinum, new_hash); } // calculate hashsum diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index bb1e42ab5fa..bada7490288 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB { @@ -21,7 +21,7 @@ namespace ErrorCodes // FunctionStringHash // Simhash: String -> UInt64 // Minhash: String -> (UInt64, UInt64) -template +template class FunctionsStringHash : public IFunction { public: @@ -38,7 +38,7 @@ public: if (!isString(arguments[0])) throw Exception( "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if (IsSimhash) + if constexpr (is_simhash) return std::make_shared>(); auto element = DataTypeFactory::instance().get("UInt64"); return std::make_shared(DataTypes{element, element}); @@ -49,7 +49,7 @@ public: const ColumnPtr & column = block.getByPosition(arguments[0]).column; const ColumnConst * col_const = typeid_cast(&*column); using ResultType = typename Impl::ResultType; - if constexpr (IsSimhash) + if constexpr (is_simhash) { if (col_const) { diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index fdef72d4c43..5c13a57c426 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -40,21 +40,11 @@ struct BitHammingDistanceImpl c[i] = apply(a, b[i]); } - static ResultType constant_constant(A a, B b) { return apply(a, b); } - private: - static UInt8 pop_cnt(UInt64 res) - { - UInt8 count = 0; - for (; res; res >>= 1) - count += res & 1u; - return count; - } - static inline UInt8 apply(UInt64 a, UInt64 b) { UInt64 res = a ^ b; - return pop_cnt(res); + return 
__builtin_popcountll(res); } }; @@ -75,9 +65,7 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { - return castType(right, [&](const auto & right_) { return f(left_, right_); }); - }); + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); } // bitHammingDistance function: (Integer, Integer) -> UInt8 @@ -103,12 +91,13 @@ public: return std::make_shared(); } + bool useDefaultImplementationForConstants() const override { return true; } + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { auto * left_generic = block.getByPosition(arguments[0]).type.get(); auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) - { + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -121,16 +110,6 @@ public: auto col_left_raw = block.getByPosition(arguments[0]).column.get(); auto col_right_raw = block.getByPosition(arguments[1]).column.get(); - if (auto col_left = checkAndGetColumnConst(col_left_raw)) - { - if (auto col_right = checkAndGetColumnConst(col_right_raw)) - { - // constant integer - constant integer - auto res = OpImpl::constant_constant(col_left->template getValue(), col_right->template getValue()); - block.getByPosition(result).column = DataTypeUInt8().createColumnConst(col_left->size(), toField(res)); - return true; - } - } typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); diff --git a/src/Functions/registerFunctions.cpp b/src/Functions/registerFunctions.cpp index 02013e33d16..f3e2883a179 
100644 --- a/src/Functions/registerFunctions.cpp +++ b/src/Functions/registerFunctions.cpp @@ -3,7 +3,6 @@ namespace DB { - void registerFunctionsArithmetic(FunctionFactory &); void registerFunctionsArray(FunctionFactory &); void registerFunctionsTuple(FunctionFactory &); @@ -37,6 +36,9 @@ void registerFunctionsIntrospection(FunctionFactory &); void registerFunctionsNull(FunctionFactory &); void registerFunctionsJSON(FunctionFactory &); void registerFunctionsConsistentHashing(FunctionFactory & factory); +void registerFunctionBitHammingDistance(FunctionFactory & factory); +void registerFunctionTupleHammingDistance(FunctionFactory & factory); +void registerFunctionsStringHash(FunctionFactory & factory); void registerFunctions() @@ -78,6 +80,9 @@ void registerFunctions() registerFunctionsJSON(factory); registerFunctionsIntrospection(factory); registerFunctionsConsistentHashing(factory); + registerFunctionBitHammingDistance(factory); + registerFunctionTupleHammingDistance(factory); + registerFunctionsStringHash(factory); } } diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 45c113edad4..8b3f9a696aa 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -51,19 +51,7 @@ struct TupleHammingDistanceImpl static ResultType constant_constant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } private: - static UInt8 pop_cnt(UInt64 res) - { - UInt8 count = 0; - for (; res; res >>= 1) - count += res & 1u; - return count; - } - - static inline UInt8 apply(UInt64 a, UInt64 b) - { - UInt64 res = a ^ b; - return pop_cnt(res); - } + static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; } }; template @@ -83,12 +71,10 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { - return castType(right, [&](const auto & 
right_) { return f(left_, right_); }); - }); + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); } -// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 +// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->0/1/2 // in order to avoid code bloating, for non-constant tuple, we make sure that the elements // in the tuple should have same data type, and for constant tuple, elements can be any integer // data type, we cast all of them into UInt64 @@ -126,8 +112,7 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); - bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) - { + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -138,7 +123,9 @@ public: using OpImpl = TupleHammingDistanceImpl; - // constant tuple - constant tuple + // we can not useDefaultImplementationForConstants, + // because with that, tupleHammingDistance((10, 300), (10, 20)) does not work, + // since 10 has data type UInt8, and 300 has data type UInt16 if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) { if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.reference b/tests/queries/0_stateless/01016_simhash_minhash.reference new file mode 100644 index 00000000000..7fa70b343a4 --- /dev/null +++ b/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -0,0 +1,59 @@ +0 +2718169299 +2718169299 +3333471646 +26585365 +4124079607 +4124079607 +4124079607 +979945684 
+(3614688582,3614688582) +(3614688582,3614688582) +(765622645,765622645) +(765622645,765622645) +(765622645,765622645) +(765622645,765622645) +(3573094983,3573094983) +(3573094983,3573094983) +(3604768422,3604768422) +(3604768422,3604768422) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,996508363) +(3614688582,996508363) +2548869326 +2548869326 +401385678 +401385710 +4258739090 +4260836242 +718415633 +718681881 +2314703251 +1238864275 +3900085650 +3907425682 +2314703251 +1238864275 +3569207545 +3568143609 +(1436198067,1436198067) +(1436198067,1436198067) +(3846780865,3846780865) +(1956854492,1956854492) +(2929435161,2929435161) +(2929435161,2929435161) +(3310088565,3310088565) +(3310088565,3310088565) +(3614688582,1294895121) +(3614688582,1294895121) +(3614688582,1138551650) +(3614688582,1138551650) +(3614688582,1294895121) +(3614688582,1294895121) +(3614688582,2840007763) +(3614688582,929186815) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.sql b/tests/queries/0_stateless/01016_simhash_minhash.sql new file mode 100644 index 00000000000..9e87216d26f --- /dev/null +++ b/tests/queries/0_stateless/01016_simhash_minhash.sql @@ -0,0 +1,47 @@ +SELECT ngramSimhash(''); +SELECT ngramSimhash('what a cute cat.'); +SELECT ngramSimhashCaseInsensitive('what a cute cat.'); +SELECT ngramSimhashUTF8('what a cute cat.'); +SELECT ngramSimhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleSimhash('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleSimhashUTF8('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitiveUTF8('what a cute cat.'); + +SELECT ngramMinhash(''); +SELECT ngramMinhash('what a cute cat.'); +SELECT ngramMinhashCaseInsensitive('what a cute cat.'); +SELECT ngramMinhashUTF8('what a cute cat.'); +SELECT ngramMinhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT 
wordShingleMinhash('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleMinhashUTF8('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitiveUTF8('what a cute cat.'); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + s String +)ENGINE = Memory(); + +INSERT INTO defaults values ('It is the latest occurrence of the Southeast European haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.') ('It is the latest occurrence of the Southeast Asian haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.'); + +SELECT ngramSimhash(s) FROM defaults; +SELECT ngramSimhashCaseInsensitive(s) FROM defaults; +SELECT ngramSimhashUTF8(s) FROM defaults; +SELECT ngramSimhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleSimhash(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleSimhashUTF8(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitiveUTF8(s) FROM defaults; + +SELECT ngramMinhash(s) FROM defaults; +SELECT ngramMinhashCaseInsensitive(s) FROM defaults; +SELECT ngramMinhashUTF8(s) FROM defaults; +SELECT ngramMinhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleMinhash(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleMinhashUTF8(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitiveUTF8(s) FROM defaults; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01017_bithamming_distance.reference 
b/tests/queries/0_stateless/01017_bithamming_distance.reference new file mode 100644 index 00000000000..cc2d4f39154 --- /dev/null +++ b/tests/queries/0_stateless/01017_bithamming_distance.reference @@ -0,0 +1,15 @@ +1 +7 +63 +2 +1 +3 +5 +4 +6 +6 +6 +3 +5 +9 +9 diff --git a/tests/queries/0_stateless/01017_bithamming_distance.sql b/tests/queries/0_stateless/01017_bithamming_distance.sql new file mode 100644 index 00000000000..4b36894b97c --- /dev/null +++ b/tests/queries/0_stateless/01017_bithamming_distance.sql @@ -0,0 +1,20 @@ +SELECT bitHammingDistance(1, 5); +SELECT bitHammingDistance(100, 100000); +SELECT bitHammingDistance(-1, 1); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + n1 UInt8, + n2 UInt16, + n3 UInt32, + n4 UInt64 +)ENGINE = Memory(); + +INSERT INTO defaults VALUES (1, 2, 3, 4) (12, 4345, 435, 1233) (45, 675, 32343, 54566) (90, 784, 9034, 778752); + +SELECT bitHammingDistance(4, n1) FROM defaults; +SELECT bitHammingDistance(n2, 100) FROM defaults; +SELECT bitHammingDistance(n3, n4) FROM defaults; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01017_tuplehamming_distance.reference b/tests/queries/0_stateless/01017_tuplehamming_distance.reference new file mode 100644 index 00000000000..017ffb0cd33 --- /dev/null +++ b/tests/queries/0_stateless/01017_tuplehamming_distance.reference @@ -0,0 +1,15 @@ +2 +1 +1 +0 +2 +2 +2 +2 +1 +2 +2 +2 +0 +2 +2 diff --git a/tests/queries/0_stateless/01017_tuplehamming_distance.sql b/tests/queries/0_stateless/01017_tuplehamming_distance.sql new file mode 100644 index 00000000000..d0ed1cee096 --- /dev/null +++ b/tests/queries/0_stateless/01017_tuplehamming_distance.sql @@ -0,0 +1,19 @@ +SELECT tupleHammingDistance((1, 2), (3, 4)); +SELECT tupleHammingDistance((120, 243), (120, 434)); +SELECT tupleHammingDistance((-12, 434), (434, 434)); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + t1 Tuple(UInt16, UInt16), + t2 Tuple(UInt32, UInt32), + t3 Tuple(Int64, Int64) +)ENGINE = Memory(); 
+ +INSERT INTO defaults VALUES ((12, 43), (12312, 43453) ,(-10, 32)) ((1, 4), (546, 12345), (546, 12345)) ((90, 9875), (43456, 234203), (1231, -123)) ((87, 987), (545645, 768354634), (9123, 909)); + +SELECT tupleHammingDistance((12, 43), t1) FROM defaults; +SELECT tupleHammingDistance(t2, (546, 456)) FROM defaults; +SELECT tupleHammingDistance(t2, t3) FROM defaults; + +DROP TABLE defaults; From 7b4fc7300c85b38a272a276ff860e226f33a578a Mon Sep 17 00:00:00 2001 From: feng lv Date: Wed, 10 Jun 2020 23:02:58 +0800 Subject: [PATCH 07/23] update fix fix fix --- src/Functions/ExtractString.h | 57 ++---- src/Functions/FunctionsStringHash.cpp | 167 +++++++++--------- src/Functions/FunctionsStringHash.h | 84 +++------ .../01016_simhash_minhash.reference | 59 +++---- 4 files changed, 145 insertions(+), 222 deletions(-) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index f6a7394a9fc..51d6f17380c 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -1,3 +1,5 @@ +#include +#include #include #include @@ -19,6 +21,9 @@ struct ExtractStringImpl { static constexpr size_t default_padding = 16; + // the length of code_points = default_padding + N -1 + // pos: the current beginning location that we want to copy data + // end: the end loction of the string static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) { /// Offset before which we copy some data. 
@@ -47,32 +52,8 @@ struct ExtractStringImpl return default_padding; } - // used by FunctionsStringHash - // it's not easy to add padding for ColumnString, so we need safety check each memcpy - static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end) - { - constexpr size_t padding_offset = default_padding - N + 1; - memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); - - // safety check - size_t cpy_size = (pos + padding_offset > end) ? end - pos : padding_offset; - - memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8)); - - if constexpr (CaseInsensitive) - { - unrollLowering(code_points, std::make_index_sequence()); - } - pos += padding_offset; - if (pos > end) - return default_padding - (pos - end); - return default_padding; - } - - // read a ASCII word from pos to word - // if the word size exceeds max_word_size, only read max_word_size byte - // in FuntionsStringHash, the default value of max_word_size is 128 - static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, size_t max_word_size) + // read a ASCII word + static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray & word_buf, const char *& pos, const char * end) { // jump seperators while (pos < end && !isAlphaNumericASCII(*pos)) @@ -83,14 +64,12 @@ struct ExtractStringImpl while (pos < end && isAlphaNumericASCII(*pos)) ++pos; - size_t word_size = std::min(pos - word_start, max_word_size); - - memcpy(word, word_start, word_size); + word_buf.assign(word_start, pos); if (CaseInsensitive) { - std::transform(word, word + word_size, word, [](UInt8 c) { return std::tolower(c); }); + std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); }); } - return word_size; + return word_buf.size(); } static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * 
end) @@ -106,27 +85,21 @@ struct ExtractStringImpl } // read one UTF8 word from pos to word - // also, we assume that one word size cann't exceed max_word_size with default value 128 - static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, size_t max_word_size) + static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray & word_buf, const char *& pos, const char * end) { // jump UTF8 seperator while (pos < end && isUTF8Sep(*pos)) ++pos; + word_buf.clear(); // UTF8 word's character number - size_t num = 0; - while (pos < end && num < max_word_size && !isUTF8Sep(*pos)) + while (pos < end && !isUTF8Sep(*pos)) { - word[num++] = readOneUTF8Code(pos, end); + word_buf.push_back(readOneUTF8Code(pos, end)); } - return num; + return word_buf.size(); } private: - static ALWAYS_INLINE inline bool isAlphaNumericASCII(const UInt8 c) - { - return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122); - } - template static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) { diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index 2195ff7c703..f8c78a808b3 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -5,11 +5,15 @@ #include #include #include +#include #include #include +#include +#include #include +#include #include namespace DB @@ -64,6 +68,11 @@ struct Hash }; // Simhash String -> UInt64 +// N: the length of ngram or words shingles +// CodePoint: UInt8(ASCII) or UInt32(UTF8) +// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) +// Ngram: means ngram(true) or words shingles(false) +// CaseInsensitive: means should we consider about letter case or not template struct SimhashImpl { @@ -71,7 +80,6 @@ struct SimhashImpl using StrOp = ExtractStringImpl; // we made an assumption that the size of one word cann't exceed 128, 
which may not true // if some word's size exceed 128, it would be cut up to several word - static constexpr size_t max_word_size = 1u << 7; static constexpr size_t max_string_size = 1u << 15; static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; @@ -135,7 +143,7 @@ struct SimhashImpl static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue( const char * data, size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, size_t), + size_t (*read_one_word)(PaddedPODArray &, const char *&, const char *), UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; @@ -146,16 +154,15 @@ struct SimhashImpl // a array to store N word hash values UInt64 nword_hashes[N] = {}; // word buffer to store one word - CodePoint word_buf[max_word_size] = {}; - size_t word_size; + PaddedPODArray word_buf; // get first word shingle for (size_t i = 0; i < N && start < end; ++i) { - word_size = read_one_word(word_buf, start, end, max_word_size); - if (word_size) + read_one_word(word_buf, start, end); + if (!word_buf.empty()) { // for each word, calculate a hash value and stored into the array - nword_hashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size()); } } @@ -168,14 +175,14 @@ struct SimhashImpl } size_t offset = 0; - while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) + while (start < end && read_one_word(word_buf, start, end)) { // we need to store the new word hash value to the oldest location. 
// for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location, // so we need to store new word hash into location of a0, then ,this array become // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| - nword_hashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size()); offset = (offset + 1) % N; // according to the word hash storation way, in order to not lose the word shingle's // sequence information, when calculation word shingle hash value, we need provide the offset @@ -203,7 +210,7 @@ struct SimhashImpl if constexpr (Ngram) { if constexpr (!UTF8) - return calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + return calc_func(std::forward(args)..., StrOp::readASCIICodePoints, Hash::ngramASCIIHash); else return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); } @@ -216,17 +223,7 @@ struct SimhashImpl } } - // constant string - static inline void constant(const String data, UInt64 & res) - { - if constexpr (Ngram) - res = dispatch(ngramCalculateHashValue, data.data(), data.size()); - else - res = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); - } - - // non-constant string - static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) + static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) { for (size_t i = 0; i < offsets.size(); ++i) { @@ -239,53 +236,64 @@ struct SimhashImpl else res[i] = dispatch(wordShinglesCalculateHashValue, one_data, data_size); } + else + res[i] = -1ull; } } }; +template +class FixedHeap +{ +public: + FixedHeap() = delete; + + explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared>(K, v)) + { + std::make_heap(data_t->begin(), data_t->end(), f); + } + 
+ void insertAndReplace(size_t new_v) + { + data_t->push_back(new_v); + std::push_heap(data_t->begin(), data_t->end(), f); + std::pop_heap(data_t->begin(), data_t->end(), f); + data_t->pop_back(); + } + + const size_t * data() { return data_t->data(); } + +private: + F f; + std::shared_ptr> data_t; +}; + + // Minhash: String -> Tuple(UInt64, UInt64) // for each string, we extract ngram or word shingle, // for each ngram or word shingle, calculate a hash value, // then we take the K minimum hash values to calculate a hashsum, // and take the K maximum hash values to calculate another hashsum, // return this two hashsum: Tuple(hashsum1, hashsum2) +// +// N: the length of ngram or words shingles +// K: the number of minimum hashes and maximum hashes that we keep +// CodePoint: UInt8(ASCII) or UInt32(UTF8) +// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) +// Ngram: means ngram(true) or words shingles(false) +// CaseInsensitive: means should we consider about letter case or not template struct MinhashImpl { + using Less = std::less; + using Greater = std::greater; + using MaxHeap = FixedHeap, K, -1ULL>; + using MinHeap = FixedHeap, K, 0>; using ResultType = UInt64; using StrOp = ExtractStringImpl; - static constexpr size_t max_word_size = 1u << 7; static constexpr size_t max_string_size = 1u << 15; static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; - // insert a new value into K minimum hash array if this value - // is smaller than the greatest value in the array - static ALWAYS_INLINE inline void insertMinValue(UInt64 * hashes, UInt64 v) - { - size_t i = 0; - for (; i < K && hashes[i] <= v; ++i) - ; - if (i == K) - return; - for (size_t j = K - 2; j >= i; --j) - hashes[j + 1] = hashes[j]; - hashes[i] = v; - } - - // insert a new value into K maximum hash array if this value - // is greater than the smallest value in the array - static ALWAYS_INLINE inline void 
insertMaxValue(UInt64 * hashes, UInt64 v) - { - int i = K - 1; - for (; i >= 0 && hashes[i] >= v; --i) - ; - if (i < 0) - return; - for (int j = 1; j <= i; ++j) - hashes[j - 1] = hashes[j]; - hashes[i] = v; - } - // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) // we extract ngram from input string, and calculate a hash value for each ngram // then we take the K minimum hash values to calculate a hashsum, @@ -300,8 +308,8 @@ struct MinhashImpl const char * start = data; const char * end = data + size; // we just maintain the K minimu and K maximum hash values - UInt64 k_minimum[K] = {}; - UInt64 k_maxinum[K] = {}; + MaxHeap k_minimum_hashes(Less{}); + MinHeap k_maximum_hashes(Greater{}); CodePoint cp[simultaneously_codepoints_num] = {}; size_t found = read_code_points(cp, start, end); @@ -314,15 +322,15 @@ struct MinhashImpl auto new_hash = hash_functor(cp + iter); // insert the new hash value into array used to store K minimum value // and K maximum value - insertMinValue(k_minimum, new_hash); - insertMaxValue(k_maxinum, new_hash); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); } iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); // calculate hashsum of the K minimum hash values and K maximum hash values - UInt64 res1 = Hash::hashSum(k_maxinum, K); - UInt64 res2 = Hash::hashSum(k_maxinum, K); + UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K); + UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K); return std::make_tuple(res1, res2); } @@ -332,47 +340,46 @@ struct MinhashImpl static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( const char * data, size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, size_t), + size_t (*read_one_word)(PaddedPODArray &, const char *&, const char *), UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; const char * end = start + size; // also we just 
store the K minimu and K maximum hash values - UInt64 k_minimum[K] = {}; - UInt64 k_maxinum[K] = {}; + MaxHeap k_minimum_hashes(Less{}); + MinHeap k_maximum_hashes(Greater{}); // array to store n word hashes UInt64 nword_hashes[N] = {}; // word buffer to store one word - CodePoint word_buf[max_word_size] = {}; - size_t word_size; + PaddedPODArray word_buf; // how word shingle hash value calculation and word hash storation is same as we // have descripted in Simhash wordShinglesCalculateHashValue function for (size_t i = 0; i < N && start < end; ++i) { - word_size = read_one_word(word_buf, start, end, max_word_size); - if (word_size) + read_one_word(word_buf, start, end); + if (!word_buf.empty()) { - nword_hashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size()); } } auto new_hash = hash_functor(nword_hashes, N, 0); - insertMinValue(k_minimum, new_hash); - insertMaxValue(k_maxinum, new_hash); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); size_t offset = 0; - while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) + while (start < end && read_one_word(word_buf, start, end)) { - nword_hashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size()); offset = (offset + 1) % N; new_hash = hash_functor(nword_hashes, N, offset); - insertMinValue(k_minimum, new_hash); - insertMaxValue(k_maxinum, new_hash); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); } // calculate hashsum - UInt64 res1 = Hash::hashSum(k_minimum, K); - UInt64 res2 = Hash::hashSum(k_maxinum, K); + UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K); + UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K); return std::make_tuple(res1, res2); } @@ -382,7 +389,7 @@ struct MinhashImpl if constexpr (Ngram) { if constexpr (!UTF8) - return 
calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + return calc_func(std::forward(args)..., StrOp::readASCIICodePoints, Hash::ngramASCIIHash); else return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); } @@ -395,17 +402,7 @@ struct MinhashImpl } } - // constant string - static void constant(const String data, UInt64 & res1, UInt64 & res2) - { - if constexpr (Ngram) - std::tie(res1, res2) = dispatch(ngramCalculateHashValue, data.data(), data.size()); - else - std::tie(res1, res2) = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); - } - - // non-constant string - static void vector( + static void apply( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res1, @@ -422,6 +419,8 @@ struct MinhashImpl else std::tie(res1[i], res2[i]) = dispatch(wordShinglesCalculateHashValue, one_data, data_size); } + else + std::tie(res1[i], res2[i]) = std::make_tuple(-1ull, -1ull); } } }; diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index bada7490288..23c6db51e8e 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -44,77 +44,37 @@ public: return std::make_shared(DataTypes{element, element}); } + bool useDefaultImplementationForConstants() const override { return true; } + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { const ColumnPtr & column = block.getByPosition(arguments[0]).column; - const ColumnConst * col_const = typeid_cast(&*column); using ResultType = typename Impl::ResultType; if constexpr (is_simhash) { - if (col_const) - { - ResultType res{}; - const String & str_data = col_const->getValue(); - if (str_data.size() > Impl::max_string_size) - { - throw Exception( - "String size is too big for function " + getName() + ". 
Should be at most " + std::to_string(Impl::max_string_size), - ErrorCodes::TOO_LARGE_STRING_SIZE); - } - Impl::constant(str_data, res); - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(1, toField(res)); - } - else - { - // non const string - auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(column->size()); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); - block.getByPosition(result).column = std::move(col_res); - } + // non const string, const case is handled by useDefaultImplementationForConstants. + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); + block.getByPosition(result).column = std::move(col_res); } else // Min hash { - if (col_const) - { - ResultType h1, h2; - const String & str_data = col_const->getValue(); - if (str_data.size() > Impl::max_string_size) - { - throw Exception( - "String size is too big for function " + getName() + ". 
Should be at most " + std::to_string(Impl::max_string_size), - ErrorCodes::TOO_LARGE_STRING_SIZE); - } - Impl::constant(str_data, h1, h2); - auto h1_col = ColumnVector::create(1); - auto h2_col = ColumnVector::create(1); - typename ColumnVector::Container & h1_data = h1_col->getData(); - typename ColumnVector::Container & h2_data = h2_col->getData(); - h1_data[0] = h1; - h2_data[0] = h2; - MutableColumns tuple_columns; - tuple_columns.emplace_back(std::move(h1_col)); - tuple_columns.emplace_back(std::move(h2_col)); - block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); - } - else - { - // non const string - auto col_h1 = ColumnVector::create(); - auto col_h2 = ColumnVector::create(); - typename ColumnVector::Container & vec_h1 = col_h1->getData(); - typename ColumnVector::Container & vec_h2 = col_h2->getData(); - vec_h1.resize(column->size()); - vec_h2.resize(column->size()); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); - MutableColumns tuple_columns; - tuple_columns.emplace_back(std::move(col_h1)); - tuple_columns.emplace_back(std::move(col_h2)); - block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); - } + // non const string + auto col_h1 = ColumnVector::create(); + auto col_h2 = ColumnVector::create(); + typename ColumnVector::Container & vec_h1 = col_h1->getData(); + typename ColumnVector::Container & vec_h2 = col_h2->getData(); + vec_h1.resize(column->size()); + vec_h2.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); + MutableColumns tuple_columns; + tuple_columns.emplace_back(std::move(col_h1)); + tuple_columns.emplace_back(std::move(col_h2)); + block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); } } }; diff --git 
a/tests/queries/0_stateless/01016_simhash_minhash.reference b/tests/queries/0_stateless/01016_simhash_minhash.reference index 7fa70b343a4..2ababa29d1e 100644 --- a/tests/queries/0_stateless/01016_simhash_minhash.reference +++ b/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -7,24 +7,15 @@ 4124079607 4124079607 979945684 -(3614688582,3614688582) -(3614688582,3614688582) -(765622645,765622645) -(765622645,765622645) -(765622645,765622645) -(765622645,765622645) -(3573094983,3573094983) -(3573094983,3573094983) -(3604768422,3604768422) -(3604768422,3604768422) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,996508363) -(3614688582,996508363) +(3700739653,3614688582) +(2594676265,556335836) +(2594676265,556335836) +(3157724679,410999184) +(1378962320,1336242123) +(3277652371,1284714580) +(3277652371,1284714580) +(3277652371,1284714580) +(3140472415,3787127930) 2548869326 2548869326 401385678 @@ -41,19 +32,19 @@ 1238864275 3569207545 3568143609 -(1436198067,1436198067) -(1436198067,1436198067) -(3846780865,3846780865) -(1956854492,1956854492) -(2929435161,2929435161) -(2929435161,2929435161) -(3310088565,3310088565) -(3310088565,3310088565) -(3614688582,1294895121) -(3614688582,1294895121) -(3614688582,1138551650) -(3614688582,1138551650) -(3614688582,1294895121) -(3614688582,1294895121) -(3614688582,2840007763) -(3614688582,929186815) +(1525603924,509999509) +(1525603924,3764233597) +(1525603924,2706466536) +(1525603924,1315689278) +(3824755630,2122451089) +(946380879,2122451089) +(3295904092,4129673330) +(3295904092,4129673330) +(138351420,974287950) +(824220170,974287950) +(3300081739,2402902535) +(3300081739,3993394872) +(138351420,974287950) +(824220170,974287950) +(3083836461,957058619) +(4120380459,90533100) From 61817b30fc5474599b48b16456f0d2f55f756b59 Mon Sep 17 00:00:00 2001 From: feng lv Date: Wed, 24 Jun 2020 00:28:17 +0800 
Subject: [PATCH 08/23] fix --- src/Functions/FunctionsStringSimilarity.cpp | 49 +++++++++++++-------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/src/Functions/FunctionsStringSimilarity.cpp b/src/Functions/FunctionsStringSimilarity.cpp index 81adb1de26f..cf9d4d6e42a 100644 --- a/src/Functions/FunctionsStringSimilarity.cpp +++ b/src/Functions/FunctionsStringSimilarity.cpp @@ -1,6 +1,6 @@ +#include #include #include -#include #include #include #include @@ -268,8 +268,7 @@ struct NgramDistanceImpl size_t distance = second_size; if (data_size <= max_string_size) { - size_t first_size - = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); + size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); /// For !symmetric version we should not use first_size. if constexpr (symmetric) res = distance * 1.f / std::max(first_size + second_size, size_t(1)); @@ -313,14 +312,23 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { /// Get needle stats. - const size_t needle_stats_size - = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); + const size_t needle_stats_size = dispatchSearcher( + calculateNeedleStats, + needle, + needle_size, + common_stats, + needle_ngram_storage.get()); size_t distance = needle_stats_size; /// Combine with haystack stats, return to initial needle stats. const size_t haystack_stats_size = dispatchSearcher( - calculateHaystackStatsAndMetric, haystack, haystack_size, common_stats, distance, haystack_ngram_storage.get()); + calculateHaystackStatsAndMetric, + haystack, + haystack_size, + common_stats, + distance, + haystack_ngram_storage.get()); /// Return to zero array stats. 
for (size_t j = 0; j < needle_stats_size; ++j) @@ -382,8 +390,12 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { - const size_t needle_stats_size - = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); + const size_t needle_stats_size = dispatchSearcher( + calculateNeedleStats, + needle, + needle_size, + common_stats, + needle_ngram_storage.get()); size_t distance = needle_stats_size; @@ -407,11 +419,15 @@ struct NgramDistanceImpl prev_offset = needle_offsets[i]; } + } } static void vectorConstant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + std::string needle, + PaddedPODArray & res) { /// zeroing our map NgramStats common_stats = {}; @@ -437,8 +453,7 @@ struct NgramDistanceImpl size_t haystack_stats_size = dispatchSearcher( calculateHaystackStatsAndMetric, reinterpret_cast(haystack), - haystack_size, - common_stats, + haystack_size, common_stats, distance, ngram_storage.get()); /// For !symmetric version we should not use haystack_stats_size. 
@@ -500,18 +515,14 @@ struct NameNgramSearchUTF8CaseInsensitive }; using FunctionNgramDistance = FunctionsStringSimilarity, NameNgramDistance>; -using FunctionNgramDistanceCaseInsensitive - = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; +using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8>; -using FunctionNgramDistanceCaseInsensitiveUTF8 - = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; +using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; using FunctionNgramSearch = FunctionsStringSimilarity, NameNgramSearch>; -using FunctionNgramSearchCaseInsensitive - = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; +using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; using FunctionNgramSearchUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8>; -using FunctionNgramSearchCaseInsensitiveUTF8 - = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; +using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; void registerFunctionsStringSimilarity(FunctionFactory & factory) From 07b5f9a58f1546a2afe1a65ef084d359b1c3dbf4 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 7 Aug 2020 18:04:51 +0300 Subject: [PATCH 09/23] Fix build. 
--- src/Functions/FunctionsMiscellaneous.h | 2 ++ src/Functions/FunctionsStringHash.h | 2 +- src/Functions/bitHammingDistance.cpp | 2 +- src/Functions/tupleHammingDistance.cpp | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Functions/FunctionsMiscellaneous.h b/src/Functions/FunctionsMiscellaneous.h index 5703f72ce2a..6cd11b12bd9 100644 --- a/src/Functions/FunctionsMiscellaneous.h +++ b/src/Functions/FunctionsMiscellaneous.h @@ -210,6 +210,8 @@ public: if (action.type == ExpressionAction::Type::JOIN || action.type == ExpressionAction::Type::ARRAY_JOIN) throw Exception("Expression with arrayJoin or other unusual action cannot be captured", ErrorCodes::BAD_ARGUMENTS); +std::cerr << "=============== FunctionCaptureOverloadResolver expr " << expression_actions->dumpActions() << std::endl; + std::unordered_map arguments_map; const auto & all_arguments = expression_actions->getRequiredColumnsWithTypes(); diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index 23c6db51e8e..64ee7f9fe59 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -46,7 +46,7 @@ public: bool useDefaultImplementationForConstants() const override { return true; } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override { const ColumnPtr & column = block.getByPosition(arguments[0]).column; using ResultType = typename Impl::ResultType; diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index 5c13a57c426..21d4aa2c69c 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -93,7 +93,7 @@ public: bool useDefaultImplementationForConstants() const override { return true; } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + void 
executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override { auto * left_generic = block.getByPosition(arguments[0]).type.get(); auto * right_generic = block.getByPosition(arguments[1]).type.get(); diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 8b3f9a696aa..a0dc938ab17 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -100,7 +100,7 @@ public: return std::make_shared(); } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override { const ColumnWithTypeAndName & arg1 = block.getByPosition(arguments[0]); const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); From 11966f62576e527e26d2f751f7ecf1cdc1cde14b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 10 Aug 2020 18:22:08 +0300 Subject: [PATCH 10/23] Update FunctionsMiscellaneous.h --- src/Functions/FunctionsMiscellaneous.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Functions/FunctionsMiscellaneous.h b/src/Functions/FunctionsMiscellaneous.h index 6cd11b12bd9..5703f72ce2a 100644 --- a/src/Functions/FunctionsMiscellaneous.h +++ b/src/Functions/FunctionsMiscellaneous.h @@ -210,8 +210,6 @@ public: if (action.type == ExpressionAction::Type::JOIN || action.type == ExpressionAction::Type::ARRAY_JOIN) throw Exception("Expression with arrayJoin or other unusual action cannot be captured", ErrorCodes::BAD_ARGUMENTS); -std::cerr << "=============== FunctionCaptureOverloadResolver expr " << expression_actions->dumpActions() << std::endl; - std::unordered_map arguments_map; const auto & all_arguments = expression_actions->getRequiredColumnsWithTypes(); From 2067501ead62322c51ec4de0bea469c7e758d8b9 Mon Sep 17 00:00:00 2001 From: feng lv Date: Sun, 16 Aug 2020 15:42:35 +0800 Subject: [PATCH 11/23] fix --- 
src/Functions/bitHammingDistance.cpp | 23 ++++++++++++----------- src/Functions/tupleHammingDistance.cpp | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index 5c13a57c426..08678689a15 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -19,21 +19,21 @@ struct BitHammingDistanceImpl { using ResultType = UInt8; - static void NO_INLINE vector_vector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) + static void NO_INLINE vectorVector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) c[i] = apply(a[i], b[i]); } - static void NO_INLINE vector_constant(const PaddedPODArray & a, B b, PaddedPODArray & c) + static void NO_INLINE vectorConstant(const PaddedPODArray & a, B b, PaddedPODArray & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) c[i] = apply(a[i], b); } - static void NO_INLINE constant_vector(A a, const PaddedPODArray & b, PaddedPODArray & c) + static void NO_INLINE constantVector(A a, const PaddedPODArray & b, PaddedPODArray & c) { size_t size = b.size(); for (size_t i = 0; i < size; ++i) @@ -95,9 +95,10 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { - auto * left_generic = block.getByPosition(arguments[0]).type.get(); - auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + const auto * left_generic = block.getByPosition(arguments[0]).type.get(); + const auto * right_generic = block.getByPosition(arguments[1]).type.get(); + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = 
typename LeftDataType::FieldType; @@ -108,8 +109,8 @@ public: using OpImpl = BitHammingDistanceImpl; - auto col_left_raw = block.getByPosition(arguments[0]).column.get(); - auto col_right_raw = block.getByPosition(arguments[1]).column.get(); + const auto col_left_raw = block.getByPosition(arguments[0]).column.get(); + const auto col_right_raw = block.getByPosition(arguments[1]).column.get(); typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); @@ -122,7 +123,7 @@ public: if (auto col_right = checkAndGetColumn(col_right_raw)) { // constant integer - non-constant integer - OpImpl::constant_vector(col_left_const->template getValue(), col_right->getData(), vec_res); + OpImpl::constantVector(col_left_const->template getValue(), col_right->getData(), vec_res); } else return false; @@ -131,10 +132,10 @@ public: { if (auto col_right = checkAndGetColumn(col_right_raw)) // non-constant integer - non-constant integer - OpImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); + OpImpl::vectorVector(col_left->getData(), col_right->getData(), vec_res); else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) // non-constant integer - constant integer - OpImpl::vector_constant(col_left->getData(), col_right_const->template getValue(), vec_res); + OpImpl::vectorConstant(col_left->getData(), col_right_const->template getValue(), vec_res); else return false; } diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 8b3f9a696aa..c2d0ae66875 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -20,7 +20,7 @@ struct TupleHammingDistanceImpl { using ResultType = UInt8; - static void NO_INLINE vector_vector( + static void NO_INLINE vectorVector( const PaddedPODArray & a1, const PaddedPODArray & b1, const PaddedPODArray & a2, @@ -33,7 +33,7 @@ struct TupleHammingDistanceImpl } static void NO_INLINE - vector_constant(const 
PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) + vectorConstant(const PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) { size_t size = a1.size(); for (size_t i = 0; i < size; ++i) @@ -41,14 +41,14 @@ struct TupleHammingDistanceImpl } static void NO_INLINE - constant_vector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) + constantVector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) { size_t size = a2.size(); for (size_t i = 0; i < size; ++i) c[i] = apply(a1, a2[i]) + apply(b1, b2[i]); } - static ResultType constant_constant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } + static ResultType constantConstant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } private: static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; } @@ -112,7 +112,8 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); - bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -137,7 +138,7 @@ public: cols1[1]->get(0, b1); cols2[0]->get(0, a2); cols2[1]->get(0, b2); - auto res = OpImpl::constant_constant(a1.get(), b1.get(), a2.get(), b2.get()); + auto res = OpImpl::constantConstant(a1.get(), b1.get(), a2.get(), b2.get()); block.getByPosition(result).column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); return true; } @@ -159,7 +160,7 @@ public: auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); auto col_r2 
= checkAndGetColumn(&col_right->getColumn(1)); if (col_r1 && col_r2) - OpImpl::constant_vector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); + OpImpl::constantVector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); else return false; } @@ -179,7 +180,7 @@ public: Field a2, b2; const_cols[0]->get(0, a2); const_cols[1]->get(0, b2); - OpImpl::vector_constant(col_l1->getData(), col_l2->getData(), a2.get(), a2.get(), vec_res); + OpImpl::vectorConstant(col_l1->getData(), col_l2->getData(), a2.get(), a2.get(), vec_res); } // non-constant tuple - non-constant tuple else if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) @@ -187,7 +188,7 @@ public: auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); if (col_r1 && col_r2) - OpImpl::vector_vector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res); + OpImpl::vectorVector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res); else return false; } From 9a370a03ef19e6e306e479cd5f7ac445a67fad75 Mon Sep 17 00:00:00 2001 From: feng lv Date: Thu, 10 Sep 2020 15:36:38 +0800 Subject: [PATCH 12/23] fix fix --- src/Functions/FunctionsStringHash.h | 1 - src/Functions/bitHammingDistance.cpp | 7 +++---- src/Functions/tupleHammingDistance.cpp | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index 64ee7f9fe59..19fea2d4fc6 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -15,7 +15,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int TOO_LARGE_STRING_SIZE; } // FunctionStringHash diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index cb79b498aa6..cb34634b00d 100644 --- a/src/Functions/bitHammingDistance.cpp +++ 
b/src/Functions/bitHammingDistance.cpp @@ -97,8 +97,7 @@ public: { const auto * left_generic = block.getByPosition(arguments[0]).type.get(); const auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) - { + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -109,8 +108,8 @@ public: using OpImpl = BitHammingDistanceImpl; - const auto col_left_raw = block.getByPosition(arguments[0]).column.get(); - const auto col_right_raw = block.getByPosition(arguments[1]).column.get(); + const auto * const col_left_raw = block.getByPosition(arguments[0]).column.get(); + const auto * const col_right_raw = block.getByPosition(arguments[1]).column.get(); typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 2f0475f3a6c..aa38426d228 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -106,8 +106,8 @@ public: const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); const DataTypeTuple & type1 = static_cast(*arg1.type); const DataTypeTuple & type2 = static_cast(*arg2.type); - auto & left_elems = type1.getElements(); - auto & right_elems = type2.getElements(); + const auto & left_elems = type1.getElements(); + const auto & right_elems = type2.getElements(); if (left_elems.size() != 2 || right_elems.size() != 2) throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", From 3a723f77be0c818fdd082c10fcf88ab3fb8762cf Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 17 Dec 2020 19:09:54 +0300 Subject: [PATCH 13/23] Try fix submodules --- .gitmodules | 
4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitmodules b/.gitmodules index 3b6ff83099e..23deab0eed0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -188,6 +188,10 @@ path = contrib/cyrus-sasl url = https://github.com/cyrusimap/cyrus-sasl branch = cyrus-sasl-2.1 +[submodule "contrib/croaring"] + path = contrib/croaring + url = https://github.com/RoaringBitmap/CRoaring + branch = v0.2.66 [submodule "contrib/miniselect"] path = contrib/miniselect url = https://github.com/danlark1/miniselect From 1287ef607a568ddfbcdeb3981653758753630ee2 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 17 Dec 2020 19:26:31 +0300 Subject: [PATCH 14/23] Fix other adaprots --- contrib/croaring | 1 + 1 file changed, 1 insertion(+) create mode 160000 contrib/croaring diff --git a/contrib/croaring b/contrib/croaring new file mode 160000 index 00000000000..5f20740ec0d --- /dev/null +++ b/contrib/croaring @@ -0,0 +1 @@ +Subproject commit 5f20740ec0de5e153e8f4cb2ab91814e8b291a14 From 654b8528b25718f933b026225a0d5ab88b0ff2d0 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 17 Dec 2020 22:14:01 +0300 Subject: [PATCH 15/23] Try fix build --- src/Functions/FunctionsStringHash.h | 8 ++++---- src/Functions/bitHammingDistance.cpp | 17 ++++++++++------- src/Functions/tupleHammingDistance.cpp | 17 +++++++++++------ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index 19fea2d4fc6..93795d1eaab 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -45,9 +45,9 @@ public: bool useDefaultImplementationForConstants() const override { return true; } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { - const ColumnPtr & column = block.getByPosition(arguments[0]).column; + const ColumnPtr & 
column = arguments[0].column; using ResultType = typename Impl::ResultType; if constexpr (is_simhash) { @@ -57,7 +57,7 @@ public: vec_res.resize(column->size()); const ColumnString * col_str_vector = checkAndGetColumn(&*column); Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); - block.getByPosition(result).column = std::move(col_res); + return std::move(col_res); } else // Min hash { @@ -73,7 +73,7 @@ public: MutableColumns tuple_columns; tuple_columns.emplace_back(std::move(col_h1)); tuple_columns.emplace_back(std::move(col_h2)); - block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); + return ColumnTuple::create(std::move(tuple_columns)); } } }; diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index cb34634b00d..6e07ca0ad7c 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -93,10 +93,11 @@ public: bool useDefaultImplementationForConstants() const override { return true; } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const auto * left_generic = block.getByPosition(arguments[0]).type.get(); - const auto * right_generic = block.getByPosition(arguments[1]).type.get(); + const auto * left_generic = arguments[0].type.get(); + const auto * right_generic = arguments[1].type.get(); + ColumnPtr result_column; bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; @@ -108,14 +109,14 @@ public: using OpImpl = BitHammingDistanceImpl; - const auto * const col_left_raw = block.getByPosition(arguments[0]).column.get(); - const auto * const col_right_raw = block.getByPosition(arguments[1]).column.get(); + const auto * const 
col_left_raw = arguments[0].column.get(); + const auto * const col_right_raw = arguments[1].column.get(); typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); auto & vec_res = col_res->getData(); - vec_res.resize(block.rows()); + vec_res.resize(input_rows_count); if (auto col_left_const = checkAndGetColumnConst(col_left_raw)) { @@ -141,11 +142,13 @@ public: else return false; - block.getByPosition(result).column = std::move(col_res); + result_column = std::move(col_res); return true; }); if (!valid) throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + + return result_column; } }; diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index aa38426d228..67d5f73065b 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -100,10 +100,10 @@ public: return std::make_shared(); } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const ColumnWithTypeAndName & arg1 = block.getByPosition(arguments[0]); - const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); + const ColumnWithTypeAndName & arg1 = arguments[0]; + const ColumnWithTypeAndName & arg2 = arguments[1]; const DataTypeTuple & type1 = static_cast(*arg1.type); const DataTypeTuple & type2 = static_cast(*arg2.type); const auto & left_elems = type1.getElements(); @@ -112,6 +112,9 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); + + ColumnPtr result_column; + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; @@ -139,7 +142,7 @@ public: 
cols2[0]->get(0, a2); cols2[1]->get(0, b2); auto res = OpImpl::constantConstant(a1.get(), b1.get(), a2.get(), b2.get()); - block.getByPosition(result).column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); + result_column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); return true; } } @@ -147,7 +150,7 @@ public: typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); auto & vec_res = col_res->getData(); - vec_res.resize(block.rows()); + vec_res.resize(input_rows_count); // constant tuple - non-constant tuple if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) { @@ -200,11 +203,13 @@ public: } else return false; - block.getByPosition(result).column = std::move(col_res); + result_column = std::move(col_res); return true; }); if (!valid) throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + + return result_column; } }; From c3a99e21bd3fc1b7783da7af30d93ee997a66161 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 21 Dec 2020 14:39:17 +0300 Subject: [PATCH 16/23] Refactor some code. 
--- src/Functions/FunctionsStringHash.cpp | 164 +++++++++++++++----------- src/Functions/FunctionsStringHash.h | 28 +++-- 2 files changed, 108 insertions(+), 84 deletions(-) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index f8c78a808b3..b27d8601f3a 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -18,52 +18,94 @@ namespace DB { + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + struct Hash { static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points) { - return intHashCRC32(unalignedLoad(code_points)); +#ifdef __SSE4_2__ + return _mm_crc32_u64(-1ULL, unalignedLoad(code_points)); +#else + throw Exception("ngramASCIIHash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif } static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points) - { - UInt64 combined = (static_cast(code_points[0]) << 32) | code_points[1]; -#ifdef __SSE4_2__ - return _mm_crc32_u64(code_points[2], combined); -#else - return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])); -#endif - } - - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) { UInt64 crc = -1ULL; #ifdef __SSE4_2__ - for (size_t i = offset; i < size; ++i) - crc = _mm_crc32_u64(crc, hashes[i]); - for (size_t i = 0; i < offset; ++i) - crc = _mm_crc32_u64(crc, hashes[i]); + crc = _mm_crc32_u64(crc, code_points[0]); + crc = _mm_crc32_u64(crc, code_points[1]); + crc = _mm_crc32_u64(crc, code_points[2]); #else - for (size_t i = offset; i < size; ++i) - crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); - for (size_t i = 0; i < offset; ++i) - crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); + throw Exception("ngramUTF8Hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); #endif return crc; } - template - static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, size_t K) + 
static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt32 * hashes, size_t size, size_t offset) { - UInt64 crc = -1ULL; + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; #ifdef __SSE4_2__ - for (size_t i = 0; i < K; ++i) - crc = _mm_crc32_u64(crc, hashes[i]); + for (size_t i = offset; i < size; i += 2) + crc1 = _mm_crc32_u64(crc1, hashes[i]); + for (size_t i = offset + 1; i < size; i += 2) + crc2 = _mm_crc32_u64(crc2, hashes[i]); + + if ((size - offset) & 1) + { + for (size_t i = 0; i < offset; i += 2) + crc2 = _mm_crc32_u64(crc2, hashes[i]); + for (size_t i = 1; i < offset; i += 2) + crc1 = _mm_crc32_u64(crc1, hashes[i]); + } + else + { + for (size_t i = 0; i < offset; i += 2) + crc1 = _mm_crc32_u64(crc1, hashes[i]); + for (size_t i = 1; i < offset; i += 2) + crc2 = _mm_crc32_u64(crc2, hashes[i]); + } #else - for (size_t i = 0; i < K; ++i) - crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); + throw Exception("wordShinglesHash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); #endif - return crc; + return crc1 | (crc2 << 32u); + } + + static ALWAYS_INLINE inline UInt64 hashSum(const UInt8 * hashes, size_t K) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; +#ifdef __SSE4_2__ + for (size_t i = 0; i < K; i += 2) + crc1 = _mm_crc32_u8(crc1, hashes[i]); + for (size_t i = 1; i < K; i += 2) + crc2 = _mm_crc32_u8(crc2, hashes[i]); +#else + throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + return crc1 | (crc2 << 32u); + } + + static ALWAYS_INLINE inline UInt64 hashSum(const UInt32 * hashes, size_t K) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; +#ifdef __SSE4_2__ + for (size_t i = 0; i < K; i += 2) + crc1 = _mm_crc32_u32(crc1, hashes[i]); + for (size_t i = 1; i < K; i += 2) + crc2 = _mm_crc32_u32(crc2, hashes[i]); +#else + throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + return crc1 | (crc2 << 32u); } }; @@ -76,7 +118,6 @@ 
struct Hash template struct SimhashImpl { - using ResultType = UInt64; using StrOp = ExtractStringImpl; // we made an assumption that the size of one word cann't exceed 128, which may not true // if some word's size exceed 128, it would be cut up to several word @@ -204,25 +245,6 @@ struct SimhashImpl return res_bit.to_ullong(); } - template - static ALWAYS_INLINE inline auto dispatch(CalcFunc calc_func, Args &&... args) - { - if constexpr (Ngram) - { - if constexpr (!UTF8) - return calc_func(std::forward(args)..., StrOp::readASCIICodePoints, Hash::ngramASCIIHash); - else - return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); - } - else - { - if constexpr (!UTF8) - return calc_func(std::forward(args)..., StrOp::readOneASCIIWord, Hash::wordShinglesHash); - else - return calc_func(std::forward(args)..., StrOp::readOneUTF8Word, Hash::wordShinglesHash); - } - } - static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) { for (size_t i = 0; i < offsets.size(); ++i) @@ -232,9 +254,19 @@ struct SimhashImpl if (data_size <= max_string_size) { if constexpr (Ngram) - res[i] = dispatch(ngramCalculateHashValue, one_data, data_size); + { + if constexpr (!UTF8) + res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash); + else + res[i] = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } else - res[i] = dispatch(wordShinglesCalculateHashValue, one_data, data_size); + { + if constexpr (!UTF8) + res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + res[i] = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } } else res[i] = -1ull; @@ -289,7 +321,6 @@ struct MinhashImpl using Greater = std::greater; using MaxHeap = FixedHeap, K, -1ULL>; using MinHeap = FixedHeap, K, 0>; - using 
ResultType = UInt64; using StrOp = ExtractStringImpl; static constexpr size_t max_string_size = 1u << 15; static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; @@ -383,25 +414,6 @@ struct MinhashImpl return std::make_tuple(res1, res2); } - template - static ALWAYS_INLINE inline auto dispatch(CalcFunc calc_func, Args &&... args) - { - if constexpr (Ngram) - { - if constexpr (!UTF8) - return calc_func(std::forward(args)..., StrOp::readASCIICodePoints, Hash::ngramASCIIHash); - else - return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); - } - else - { - if constexpr (!UTF8) - return calc_func(std::forward(args)..., StrOp::readOneASCIIWord, Hash::wordShinglesHash); - else - return calc_func(std::forward(args)..., StrOp::readOneUTF8Word, Hash::wordShinglesHash); - } - } - static void apply( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, @@ -415,9 +427,19 @@ struct MinhashImpl if (data_size <= max_string_size) { if constexpr (Ngram) - std::tie(res1[i], res2[i]) = dispatch(ngramCalculateHashValue, one_data, data_size); + { + if constexpr (!UTF8) + std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readASCIICodePoints, Hash::ngramASCIIHash); + else + std::tie(res1[i], res2[i]) = ngramCalculateHashValue(one_data, data_size, StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } else - std::tie(res1[i], res2[i]) = dispatch(wordShinglesCalculateHashValue, one_data, data_size); + { + if constexpr (!UTF8) + std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + std::tie(res1[i], res2[i]) = wordShinglesCalculateHashValue(one_data, data_size, StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } } else std::tie(res1[i], res2[i]) = std::make_tuple(-1ull, -1ull); diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index 93795d1eaab..979f2bd8a9d 
100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -35,12 +35,14 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (!isString(arguments[0])) - throw Exception( - "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Function {} expect single String argument, got {}", getName(), arguments[0]->getName()); + + auto type = std::make_shared(); if constexpr (is_simhash) - return std::make_shared>(); - auto element = DataTypeFactory::instance().get("UInt64"); - return std::make_shared(DataTypes{element, element}); + return type; + + return std::make_shared(DataTypes{type, type}); } bool useDefaultImplementationForConstants() const override { return true; } @@ -48,24 +50,24 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { const ColumnPtr & column = arguments[0].column; - using ResultType = typename Impl::ResultType; + if constexpr (is_simhash) { // non const string, const case is handled by useDefaultImplementationForConstants. 
- auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_res = col_res->getData(); + auto col_res = ColumnVector::create(); + auto & vec_res = col_res->getData(); vec_res.resize(column->size()); const ColumnString * col_str_vector = checkAndGetColumn(&*column); Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); - return std::move(col_res); + return col_res; } else // Min hash { // non const string - auto col_h1 = ColumnVector::create(); - auto col_h2 = ColumnVector::create(); - typename ColumnVector::Container & vec_h1 = col_h1->getData(); - typename ColumnVector::Container & vec_h2 = col_h2->getData(); + auto col_h1 = ColumnVector::create(); + auto col_h2 = ColumnVector::create(); + auto & vec_h1 = col_h1->getData(); + auto & vec_h2 = col_h2->getData(); vec_h1.resize(column->size()); vec_h2.resize(column->size()); const ColumnString * col_str_vector = checkAndGetColumn(&*column); From 29e0b4ec40e2a09c0c7f30f6918ddb53229067c3 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 21 Dec 2020 17:14:05 +0300 Subject: [PATCH 17/23] Refactor cose a little bit more. --- src/Functions/ExtractString.h | 9 +++++++-- src/Functions/FunctionsStringHash.cpp | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index 51d6f17380c..b659d072887 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -19,11 +19,16 @@ namespace DB template struct ExtractStringImpl { + /// Padding form ColumnsString. It is a number of bytes we can always read starting from pos if pos < end. static constexpr size_t default_padding = 16; - // the length of code_points = default_padding + N -1 + /// Functions are read `default_padding - (N - 1)` bytes into the buffer. Window of size N is used. + /// Read copies `N - 1` last bytes from buffer into beginning, and then reads new bytes. 
+ static constexpr size_t buffer_size = default_padding + N - 1; + + // the length of code_points = buffer_size // pos: the current beginning location that we want to copy data - // end: the end loction of the string + // end: the end location of the string static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) { /// Offset before which we copy some data. diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index b27d8601f3a..70e524c5df8 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -48,7 +48,7 @@ struct Hash return crc; } - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt32 * hashes, size_t size, size_t offset) + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; @@ -122,7 +122,7 @@ struct SimhashImpl // we made an assumption that the size of one word cann't exceed 128, which may not true // if some word's size exceed 128, it would be cut up to several word static constexpr size_t max_string_size = 1u << 15; - static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size; // Simhash ngram calculate function: String ->UInt64 // this function extracting ngram from input string, and maintain a 64-dimensions vector @@ -323,7 +323,7 @@ struct MinhashImpl using MinHeap = FixedHeap, K, 0>; using StrOp = ExtractStringImpl; static constexpr size_t max_string_size = 1u << 15; - static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + static constexpr size_t simultaneously_codepoints_num = StrOp::buffer_size; // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) // we extract ngram from input string, and calculate a hash value for each ngram From 
78429eca70b1bc5803a603fa41f8ebddaae2a802 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 21 Dec 2020 17:30:17 +0300 Subject: [PATCH 18/23] Refactor code a little bit more. --- src/Functions/FunctionsStringHash.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index 70e524c5df8..ec0f5fea808 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -104,6 +104,21 @@ struct Hash crc2 = _mm_crc32_u32(crc2, hashes[i]); #else throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + return crc1 | (crc2 << 32u); + } + + static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes, size_t K) + { + UInt64 crc1 = -1ULL; + UInt64 crc2 = -1ULL; +#ifdef __SSE4_2__ + for (size_t i = 0; i < K; i += 2) + crc1 = _mm_crc32_u64(crc1, hashes[i]); + for (size_t i = 1; i < K; i += 2) + crc2 = _mm_crc32_u64(crc2, hashes[i]); +#else + throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); #endif return crc1 | (crc2 << 32u); } From 12f0f82d3d8d9282092f5afecd16fdc548a0ccba Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 21 Dec 2020 19:53:28 +0300 Subject: [PATCH 19/23] Update test. 
--- .../01016_simhash_minhash.reference | 180 +++++++++++++----- .../0_stateless/01016_simhash_minhash.sql | 64 +++++++ 2 files changed, 201 insertions(+), 43 deletions(-) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.reference b/tests/queries/0_stateless/01016_simhash_minhash.reference index 2ababa29d1e..edd5afc1af7 100644 --- a/tests/queries/0_stateless/01016_simhash_minhash.reference +++ b/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -1,50 +1,144 @@ 0 2718169299 2718169299 -3333471646 -26585365 -4124079607 -4124079607 -4124079607 -979945684 -(3700739653,3614688582) -(2594676265,556335836) -(2594676265,556335836) -(3157724679,410999184) -(1378962320,1336242123) -(3277652371,1284714580) -(3277652371,1284714580) -(3277652371,1284714580) -(3140472415,3787127930) +1315333491 +1099965843 +5746351769509927967 +5746351769509927967 +8347269581771603092 +6041373934059725027 +(17178276249054052155,8864230932371215121) +(14133097226001036899,7985237721476952807) +(14133097226001036899,7985237721476952807) +(4661257206578284012,15229878657590021759) +(3087743741749030713,11631667950302077749) +(11923981719512934676,1193672187225825732) +(11923981719512934676,1193672187225825732) +(17970606678134635272,3825545538448404526) +(9422952829151664974,568010773615758889) 2548869326 2548869326 401385678 401385710 -4258739090 -4260836242 -718415633 -718681881 -2314703251 -1238864275 -3900085650 -3907425682 -2314703251 -1238864275 -3569207545 -3568143609 -(1525603924,509999509) -(1525603924,3764233597) -(1525603924,2706466536) -(1525603924,1315689278) -(3824755630,2122451089) -(946380879,2122451089) -(3295904092,4129673330) -(3295904092,4129673330) -(138351420,974287950) -(824220170,974287950) -(3300081739,2402902535) -(3300081739,3993394872) -(138351420,974287950) -(824220170,974287950) -(3083836461,957058619) -(4120380459,90533100) +2652202579 +2652235347 +2984455347 +2984488115 +12804820948382413807 +12804820948919350245 +11651601468065149391 
+11651600368014488527 +18377198011227067677 +18233505035951822655 +5501050600367972694 +5501050600367972692 +(8590465925632898311,12699049311112305995) +(8590465925632898311,15828587343885202011) +(8590465925632898311,15824051019631343049) +(8590465925632898311,12699049311222825283) +(217966158370437743,14452995911556652133) +(217966158370437743,14452995911556652133) +(2170210914777151141,5341809779339553313) +(12469866236432988845,5341809779339553313) +(12271157076799061825,5514511977572226426) +(11639913962681153226,2767634094725305612) +(12271157075024394466,17994666970078080114) +(12271157077109587702,13572452308677868240) +(6252006845407214340,13538761942960976531) +(13795977174459370328,6392395597500134035) +(16118993428517222971,13602445809406467) +(16118993428517222971,13602445809406467) +uniqExact 6 +ngramSimhash +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 938403918 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. 
Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 904817231 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 904849486 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 938469966 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 938404430 +ngramSimhashCaseInsensitive +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 938453071 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 938453599 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 938404430 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 636382047 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 938388046 +ngramSimhashUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2400625214 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2669060670 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2671174174 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 2669060798 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 2635506238 +ngramSimhashCaseInsensitiveUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2984307934 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). 
In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2967514366 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2715855070 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2967529694 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 2984290526 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2984306910 +wordShingleSimhash +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384813566025024242 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 2393820766427040734 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 2421405261516400471 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384883934767174398 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384813567165864670 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 2384813567098766070 +wordShingleSimhashCaseInsensitive +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 11635224793909957342 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617192803208139478 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617192803208151794 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617192803208151766 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 3006891407629799254 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 11617263171950236406 +wordShingleSimhashUTF8 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 9097818277104946605 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9084246141658271116 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9084247241171471628 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 9088752215857929613 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9093255814816009484 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 9084247481822285196 +wordShingleSimhashCaseInsensitiveUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14788772559981154978 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 14497164445320454820 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14500537785782895266 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). 
In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14787646625647636642 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 14500016612976573090 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). 
In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 14787956717160870888 +ngramMinhash +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (15568933215262012353,16287411738807860353) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (9473589826959436958,14264235017873782379) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (261441656340606110,13387826928927239258) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. 
Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (261441656340606110,3305790294064680121) +ngramMinhashCaseInsensitive +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 
1 (15568933215262012353,16287411738807860353) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (9473589826959436958,14264235017873782379) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (3051755284325985438,3305790294064680121) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (3051755284325985438,13387826928927239258) +ngramMinhashUTF8 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. 
The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 4 (309830857064065611,7476109060377919216) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. 
SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (309830856946430871,7521913981442105351) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (309830857559697399,7476109060377919216) +ngramMinhashCaseInsensitiveUTF8 +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. 
It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (13010809262502929096,2266175201446733829) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. 
It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. 
It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 5 (16827851229372179144,976408052548769549) +wordShingleMinhash +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. 
It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (14343822344862533053,11776483993821900250) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. 
It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (18417749332128868312,11776483993821900250) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (18417749329907528200,14156831980621923226) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. 
It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (4600092690178227586,11776483993821900250) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (12998011837685887081,1565093152297016105) +wordShingleMinhashCaseInsensitive +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. 
Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (12998011837880940480,1565093152297016105) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (1100751419997894255,15225006848401474458) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (1100751419777226283,12993805708561478711) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. 
Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (1260401089202135898,12993805709529540523) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (1638964264353944555,12993805708561478711) +wordShingleMinhashUTF8 +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid single points of failure. 
Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (742280067319112377,14237963017046410351) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (7237654052534217600,14400297883226437452) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. 
All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (742280067319112377,17574811665615962276) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 3 (3458625375707825328,17574811665615962276) +wordShingleMinhashCaseInsensitiveUTF8 +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 1 (7032848390598450936,5104668712725998486) +ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency.\nClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes.\nClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. 1 (15582670464629505464,13034678298246801511) +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. 
All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system\'s read and write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. 1 (9935434838523508980,7648038926638343017) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.\n:::::::\nClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system.\nClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 2 (7032848390598450936,16870743692447971238) +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency.\nClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system\'s read / write availability.\nClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. 
1 (7302041809563941951,6856814412450461959) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.sql b/tests/queries/0_stateless/01016_simhash_minhash.sql index 9e87216d26f..61b9ac14259 100644 --- a/tests/queries/0_stateless/01016_simhash_minhash.sql +++ b/tests/queries/0_stateless/01016_simhash_minhash.sql @@ -44,4 +44,68 @@ SELECT wordShingleMinhashCaseInsensitive(s) FROM defaults; SELECT wordShingleMinhashUTF8(s) FROM defaults; SELECT wordShingleMinhashCaseInsensitiveUTF8(s) FROM defaults; +TRUNCATE TABLE defaults; +INSERT INTO defaults SELECT arrayJoin(splitByString('\n\n', +'ClickHouse uses all available hardware to its full potential to process each query as fast as possible. Peak processing performance for a single query stands at more than 2 terabytes per second (after decompression, only used columns). In distributed setup reads are automatically balanced among healthy replicas to avoid increasing latency. +ClickHouse supports multi-master asynchronous replication and can be deployed across multiple datacenters. All nodes are equal, which allows avoiding having single points of failure. Downtime of a single node or the whole datacenter wont affect the systems availability for both reads and writes. +ClickHouse is simple and works out-of-the-box. It streamlines all your data processing: ingest all your structured data into the system and it becomes instantly available for building reports. SQL dialect allows expressing the desired result without involving any custom non-standard API that could be found in some alternative systems. + +ClickHouse makes full use of all available hardware to process every request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (only used columns after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. 
All nodes are equal to avoid single points of failure. Downtime for one site or the entire data center will not affect the system''s read and write availability. +ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they immediately become available for building reports. The SQL dialect allows you to express the desired result without resorting to any non-standard APIs that can be found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (used columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the system''s read / write availability. +ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all your structured data into the system, and they are immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns only after unpacking). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. 
Downtime for one site or the entire data center will not affect the read / write availability of the system. +ClickHouse is simple and works out of the box. It simplifies all the processing of your data: it loads all of your structured data into the system, and it is immediately available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system. +ClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all your structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems. + +ClickHouse makes full use of all available hardware to process each request as quickly as possible. Peak performance for a single query is over 2 terabytes per second (using columns after decompression only). In a distributed setup, reads are automatically balanced across healthy replicas to avoid increased latency. +ClickHouse supports asynchronous multi-master replication and can be deployed across multiple data centers. All nodes are equal to avoid a single point of failure. Downtime for one site or the entire data center will not affect the read / write availability of the system. 
+ClickHouse is simple and works out of the box. It simplifies all processing of your data: it loads all structured data into the system and immediately becomes available for building reports. The SQL dialect allows you to express the desired result without resorting to any of the non-standard APIs found in some alternative systems.' +)); + +SELECT 'uniqExact', uniqExact(s) FROM defaults; + + +SELECT 'ngramSimhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhash(s) as h FROM defaults GROUP BY h; +SELECT 'ngramSimhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'ngramSimhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'ngramSimhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramSimhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhash(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleSimhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleSimhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; + +SELECT 'ngramMinhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhash(s) as h FROM defaults GROUP BY h; +SELECT 'ngramMinhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhashCaseInsensitive(s) as h FROM defaults GROUP BY h; 
+SELECT 'ngramMinhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'ngramMinhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), ngramMinhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhash'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhash(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhashCaseInsensitive'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhashCaseInsensitive(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhashUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhashUTF8(s) as h FROM defaults GROUP BY h; +SELECT 'wordShingleMinhashCaseInsensitiveUTF8'; +SELECT arrayStringConcat(groupArray(s), '\n:::::::\n'), count(), wordShingleMinhashCaseInsensitiveUTF8(s) as h FROM defaults GROUP BY h; + DROP TABLE defaults; From 74671bda4ae1f6ed8eda89200e9de65ed34555a7 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 22 Dec 2020 11:01:13 +0300 Subject: [PATCH 20/23] Fix special build --- src/Functions/FunctionsStringHash.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index ec0f5fea808..acb2feb87b8 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -295,12 +295,12 @@ class FixedHeap public: FixedHeap() = delete; - explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared>(K, v)) + explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared>(K, v)) { std::make_heap(data_t->begin(), data_t->end(), f); } - void insertAndReplace(size_t new_v) + void insertAndReplace(UInt64 new_v) { data_t->push_back(new_v); std::push_heap(data_t->begin(), data_t->end(), f); @@ -308,11 +308,11 @@ public: data_t->pop_back(); } - const size_t * data() { 
return data_t->data(); } + const UInt64 * data() { return data_t->data(); } private: F f; - std::shared_ptr> data_t; + std::shared_ptr> data_t; }; From 2c07516aeea0d5bdc92932685927232f8bd1c1b0 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 22 Dec 2020 18:17:23 +0300 Subject: [PATCH 21/23] Fix special build and style. --- src/Functions/FunctionsStringHash.cpp | 12 ++++++------ src/Functions/bitHammingDistance.cpp | 3 ++- src/Functions/ya.make | 3 +++ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index acb2feb87b8..5fd46117ea3 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -26,7 +26,7 @@ namespace ErrorCodes struct Hash { - static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points) + static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points [[maybe_unused]]) { #ifdef __SSE4_2__ return _mm_crc32_u64(-1ULL, unalignedLoad(code_points)); @@ -35,7 +35,7 @@ struct Hash #endif } - static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points) + static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points [[maybe_unused]]) { UInt64 crc = -1ULL; #ifdef __SSE4_2__ @@ -48,7 +48,7 @@ struct Hash return crc; } - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes [[maybe_unused]], size_t size [[maybe_unused]], size_t offset [[maybe_unused]]) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; @@ -78,7 +78,7 @@ struct Hash return crc1 | (crc2 << 32u); } - static ALWAYS_INLINE inline UInt64 hashSum(const UInt8 * hashes, size_t K) + static ALWAYS_INLINE inline UInt64 hashSum(const UInt8 * hashes [[maybe_unused]], size_t K [[maybe_unused]]) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; @@ -93,7 +93,7 @@ struct Hash return crc1 | (crc2 << 32u); } - 
static ALWAYS_INLINE inline UInt64 hashSum(const UInt32 * hashes, size_t K) + static ALWAYS_INLINE inline UInt64 hashSum(const UInt32 * hashes [[maybe_unused]], size_t K [[maybe_unused]]) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; @@ -108,7 +108,7 @@ struct Hash return crc1 | (crc2 << 32u); } - static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes, size_t K) + static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes [[maybe_unused]], size_t K [[maybe_unused]]) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index 6e07ca0ad7c..9b9ff5b6c07 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -98,7 +98,8 @@ public: const auto * left_generic = arguments[0].type.get(); const auto * right_generic = arguments[1].type.get(); ColumnPtr result_column; - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 6e5d832db77..7e64deef64d 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -53,6 +53,7 @@ SRCS( FunctionsRandom.cpp FunctionsRound.cpp FunctionsStringArray.cpp + FunctionsStringHash.cpp FunctionsStringSimilarity.cpp GatherUtils/concat.cpp GatherUtils/createArraySink.cpp @@ -185,6 +186,7 @@ SRCS( bitBoolMaskAnd.cpp bitBoolMaskOr.cpp bitCount.cpp + bitHammingDistance.cpp bitNot.cpp bitOr.cpp bitRotateLeft.cpp @@ -504,6 +506,7 @@ SRCS( tryBase64Decode.cpp tuple.cpp tupleElement.cpp + tupleHammingDistance.cpp upper.cpp upperUTF8.cpp uptime.cpp From 396ea14f67aabe128d75f592b3b27ed19d15672f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 22 Dec 2020 18:29:32 +0300 Subject: [PATCH 
22/23] Fix special build. --- src/Functions/FunctionsStringHash.cpp | 99 ++++++++++++++++----------- 1 file changed, 58 insertions(+), 41 deletions(-) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index 5fd46117ea3..d57be67ef7f 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -26,55 +26,78 @@ namespace ErrorCodes struct Hash { - static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points [[maybe_unused]]) + static UInt64 crc32u64(UInt64 crc [[maybe_unused]], UInt64 val [[maybe_unused]]) { #ifdef __SSE4_2__ - return _mm_crc32_u64(-1ULL, unalignedLoad(code_points)); + return _mm_crc32_u64(crc, val); +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + return __crc32cd(crc, val); #else - throw Exception("ngramASCIIHash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); + throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); #endif } - static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points [[maybe_unused]]) + static UInt64 crc32u32(UInt64 crc [[maybe_unused]], UInt32 val [[maybe_unused]]) + { +#ifdef __SSE4_2__ + return _mm_crc32_u32(crc, val); +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + return __crc32cw(crc, val); +#else + throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + } + + static UInt64 crc32u8(UInt64 crc [[maybe_unused]], UInt8 val [[maybe_unused]]) + { +#ifdef __SSE4_2__ + return _mm_crc32_u8(crc, val); +#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + return __crc32cb(crc, val); +#else + throw Exception("String hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); +#endif + } + + static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points) + { + return crc32u64(-1ULL, unalignedLoad(code_points)); + } + + static 
ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points) { UInt64 crc = -1ULL; -#ifdef __SSE4_2__ - crc = _mm_crc32_u64(crc, code_points[0]); - crc = _mm_crc32_u64(crc, code_points[1]); - crc = _mm_crc32_u64(crc, code_points[2]); -#else - throw Exception("ngramUTF8Hash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); -#endif + crc = crc32u64(crc, code_points[0]); + crc = crc32u64(crc, code_points[1]); + crc = crc32u64(crc, code_points[2]); return crc; } - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes [[maybe_unused]], size_t size [[maybe_unused]], size_t offset [[maybe_unused]]) + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; -#ifdef __SSE4_2__ + for (size_t i = offset; i < size; i += 2) - crc1 = _mm_crc32_u64(crc1, hashes[i]); + crc1 = crc32u64(crc1, hashes[i]); for (size_t i = offset + 1; i < size; i += 2) - crc2 = _mm_crc32_u64(crc2, hashes[i]); + crc2 = crc32u64(crc2, hashes[i]); if ((size - offset) & 1) { for (size_t i = 0; i < offset; i += 2) - crc2 = _mm_crc32_u64(crc2, hashes[i]); + crc2 = crc32u64(crc2, hashes[i]); for (size_t i = 1; i < offset; i += 2) - crc1 = _mm_crc32_u64(crc1, hashes[i]); + crc1 = crc32u64(crc1, hashes[i]); } else { for (size_t i = 0; i < offset; i += 2) - crc1 = _mm_crc32_u64(crc1, hashes[i]); + crc1 = crc32u64(crc1, hashes[i]); for (size_t i = 1; i < offset; i += 2) - crc2 = _mm_crc32_u64(crc2, hashes[i]); + crc2 = crc32u64(crc2, hashes[i]); } -#else - throw Exception("wordShinglesHash is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); -#endif + return crc1 | (crc2 << 32u); } @@ -82,14 +105,12 @@ struct Hash { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; -#ifdef __SSE4_2__ + for (size_t i = 0; i < K; i += 2) - crc1 = _mm_crc32_u8(crc1, hashes[i]); + crc1 = crc32u8(crc1, hashes[i]); for (size_t i = 1; i < K; i += 2) - crc2 = _mm_crc32_u8(crc2, 
hashes[i]); -#else - throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); -#endif + crc2 = crc32u8(crc2, hashes[i]); + return crc1 | (crc2 << 32u); } @@ -97,29 +118,25 @@ struct Hash { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; -#ifdef __SSE4_2__ + for (size_t i = 0; i < K; i += 2) - crc1 = _mm_crc32_u32(crc1, hashes[i]); + crc1 = crc32u32(crc1, hashes[i]); for (size_t i = 1; i < K; i += 2) - crc2 = _mm_crc32_u32(crc2, hashes[i]); -#else - throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); -#endif + crc2 = crc32u32(crc2, hashes[i]); + return crc1 | (crc2 << 32u); } - static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes [[maybe_unused]], size_t K [[maybe_unused]]) + static ALWAYS_INLINE inline UInt64 hashSum(const UInt64 * hashes, size_t K) { UInt64 crc1 = -1ULL; UInt64 crc2 = -1ULL; -#ifdef __SSE4_2__ + for (size_t i = 0; i < K; i += 2) - crc1 = _mm_crc32_u64(crc1, hashes[i]); + crc1 = crc32u64(crc1, hashes[i]); for (size_t i = 1; i < K; i += 2) - crc2 = _mm_crc32_u64(crc2, hashes[i]); -#else - throw Exception("hashSum is not implemented without sse4.2 support", ErrorCodes::NOT_IMPLEMENTED); -#endif + crc2 = crc32u64(crc2, hashes[i]); + return crc1 | (crc2 << 32u); } }; From 2f4968c8da61a8e2d73e5a6bc320761a565bd436 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 22 Dec 2020 19:07:04 +0300 Subject: [PATCH 23/23] Fix style. --- src/Functions/ExtractString.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index b659d072887..c4251f8c4a6 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -1,3 +1,4 @@ +#pragma once #include #include #include