From 2d198f640ed4627cf6f12d03589ccc8f16b2722e Mon Sep 17 00:00:00 2001 From: fenglv Date: Wed, 6 Nov 2019 18:34:13 +0800 Subject: [PATCH 001/504] add simhash and minhash --- dbms/src/Functions/ExtractString.h | 187 ++++++ dbms/src/Functions/FunctionsStringHash.cpp | 585 ++++++++++++++++++ dbms/src/Functions/FunctionsStringHash.h | 124 ++++ .../Functions/FunctionsStringSimilarity.cpp | 158 +---- 4 files changed, 919 insertions(+), 135 deletions(-) create mode 100644 dbms/src/Functions/ExtractString.h create mode 100644 dbms/src/Functions/FunctionsStringHash.cpp create mode 100644 dbms/src/Functions/FunctionsStringHash.h diff --git a/dbms/src/Functions/ExtractString.h b/dbms/src/Functions/ExtractString.h new file mode 100644 index 00000000000..05566496cba --- /dev/null +++ b/dbms/src/Functions/ExtractString.h @@ -0,0 +1,187 @@ +#include + +#include +#include +#include +#include +#include + +#ifdef __SSE4_2__ +# include +#endif + +namespace DB +{ +//used by FunctionsStringSimilarity and FunctionsStringHash +//includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +template +struct ExtractStringImpl +{ + static constexpr size_t default_padding = 16; + + static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) + { + /// Offset before which we copy some data. 
+ constexpr size_t padding_offset = default_padding - N + 1; + /// We have an array like this for ASCII (N == 4, other cases are similar) + /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start + /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); + /// Now we have an array + /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// Doing unaligned read of 16 bytes and copy them like above + /// 16 is also chosen to do two `movups`. + /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. + memcpy(code_points + (N - 1), pos, default_padding * sizeof(UInt8)); + + if constexpr (CaseInsensitive) + { + /// We really need template lambdas with C++20 to do it inline + unrollLowering(code_points, std::make_index_sequence()); + } + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; + } + + //used by FunctionsStringHash + //it's not easy to add padding for ColumnString, so we need safety check each memcpy + static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end) + { + constexpr size_t padding_offset = default_padding - N + 1; + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); + + //safety check + size_t cpy_size = (pos + padding_offset > end) ? 
end - pos : padding_offset; + + memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8)); + + if constexpr (CaseInsensitive) + { + unrollLowering(code_points, std::make_index_sequence()); + } + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; + } + + //read a ASCII word from pos to word + //if the word size exceeds max_word_size, only read max_word_size byte + //in FuntionsStringHash, the default value of max_word_size is 128 + static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size) + { + //jump seperators + while (pos < end && !isAlphaNum(*pos)) + ++pos; + + // word start from here + const char * word_start = pos; + while (pos < end && isAlphaNum(*pos)) + ++pos; + + size_t word_size = (static_cast(pos - word_start) <= max_word_size) ? pos - word_start : max_word_size; + + memcpy(word, word_start, word_size); + if (CaseInsensitive) + { + std::transform(word, word + word_size, word, [](UInt8 c) { return std::tolower(c); }); + } + return word_size; + } + + static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * end) + { + memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt32)); + + size_t num = N - 1; + while (num < default_padding && pos < end) + { + code_points[num++] = readOneUTF8Code(pos, end); + } + return num; + } + + //read one UTF8 word from pos to word + //also, we assume that one word size cann't exceed max_word_size with default value 128 + static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size) + { + // jump UTF8 seperator + while (pos < end && isUTF8Sep(*pos)) + ++pos; + //UTF8 word's character number + size_t num = 0; + while (pos < end && num < max_word_size && !isUTF8Sep(*pos)) + { + word[num++] = readOneUTF8Code(pos, end); + } + return 
num; + } + +private: + static ALWAYS_INLINE inline bool isAlphaNum(const UInt8 c) + { + return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122); + } + + template + static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) + { + ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); + } + + //we use ASCII non-alphanum character as UTF8 seperator + static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); } + + // read one UTF8 character and return it + static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end) + { + size_t length = UTF8::seqLength(*pos); + + if (pos + length > end) + length = end - pos; + UInt32 res; + switch (length) + { + case 1: + res = 0; + memcpy(&res, pos, 1); + break; + case 2: + res = 0; + memcpy(&res, pos, 2); + break; + case 3: + res = 0; + memcpy(&res, pos, 3); + break; + default: + memcpy(&res, pos, 4); + } + + if constexpr (CaseInsensitive) + { + switch (length) + { + case 4: + res &= ~(1u << (5 + 3 * CHAR_BIT)); + [[fallthrough]]; + case 3: + res &= ~(1u << (5 + 2 * CHAR_BIT)); + [[fallthrough]]; + case 2: + res &= ~(1u); + res &= ~(1u << (5 + CHAR_BIT)); + [[fallthrough]]; + default: + res &= ~(1u << 5); + } + } + pos += length; + return res; + } +}; +} diff --git a/dbms/src/Functions/FunctionsStringHash.cpp b/dbms/src/Functions/FunctionsStringHash.cpp new file mode 100644 index 00000000000..797d7d30078 --- /dev/null +++ b/dbms/src/Functions/FunctionsStringHash.cpp @@ -0,0 +1,585 @@ +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace DB +{ +struct Hash +{ + static ALWAYS_INLINE inline UInt64 ngramASCIIHash(const UInt8 * code_points) + { + return intHashCRC32(unalignedLoad(code_points)); + } + + static ALWAYS_INLINE inline UInt64 ngramUTF8Hash(const UInt32 * code_points) + { + UInt64 combined = (static_cast(code_points[0]) << 32) | 
code_points[1]; +#ifdef __SSE4_2__ + return _mm_crc32_u64(code_points[2], combined); +#else + return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])); +#endif + } + + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, const size_t & size, const size_t & offset) + { + UInt64 res = 0; + UInt8 flag = 0; + for (size_t i = offset; i < size; ++i) + { + if (flag) + res &= intHashCRC32(hashes[i]); + else + res |= intHashCRC32(hashes[i]); + flag = (flag + 1) % 2; + } + for (size_t i = 0; i < offset; ++i) + { + if (flag) + res &= intHashCRC32(hashes[i]); + else + res |= intHashCRC32(hashes[i]); + flag = (flag + 1) % 2; + } + return res; + } + + template + static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, const size_t & K) + { + UInt64 even = 0; + UInt64 odd = 0; + size_t i = 0; + for (; i + 1 < K; i += 2) + { + even |= intHashCRC32(hashes[i]); + odd |= intHashCRC32(hashes[i + 1]); + } + if (i < K) + even |= intHashCRC32(hashes[K - 1]); +#ifdef __SSE4_2__ + return _mm_crc32_u64(even, odd); +#else + return (intHashCRC32(even) ^ intHashCRC32(odd)); +#endif + } +}; + +//Sinhash String -> UInt64 +template +struct SimhashImpl +{ + using ResultType = UInt64; + using StrOp = ExtractStringImpl; + // we made an assumption that the size of one word cann't exceed 128, which may not true + // if some word's size exceed 128, it would be cut up to several word + static constexpr size_t max_word_size = 1u << 7; + static constexpr size_t max_string_size = 1u << 15; + static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + + // Simhash ngram calculate function: String ->UInt64 + // this function extracting ngram from input string, and maintain a 64-dimensions vector + // for each ngram, calculate a 64 bit hash value, and update the vector according the hash value + // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 + static ALWAYS_INLINE inline UInt64 
ngramCalculateHashValue( + const char * data, + const size_t size, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt64 (*hash_functor)(const CodePoint *)) + { + const char * start = data; + const char * end = data + size; + // fingerprint vector, all dimensions initialized to zero at the first + Int64 finger_vec[64] = {}; + CodePoint cp[simultaneously_codepoints_num] = {}; + + size_t found = read_code_points(cp, start, end); + size_t iter = N - 1; + + do + { + for (; iter + N <= found; ++iter) + { + // for each ngram, we can calculate an 64 bit hash + // then update finger_vec according to this hash value + // if the i'th bit is 1, finger_vec[i] plus 1, otherwise minus 1 + UInt64 hash_value = hash_functor(cp + iter); + std::bitset<64> bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits.test(i)) ? 1 : -1); + } + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + //finally, we return a 64 bit value according to finger_vec + //if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 + std::bitset<64> res_bit(0u); + for (size_t i = 0; i < 64; ++i) + { + if (finger_vec[i] > 0) + res_bit.set(i); + } + return res_bit.to_ullong(); + } + + // Simhash word shingle calculate funtion: String -> UInt64 + // this function extracting n word shingle from input string, and maintain a 64-dimensions vector as well + // for each word shingle, calculate a 64 bit hash value, and update the vector according the hash value + // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 + // + // word shingle hash value calculate: + // 1. at the first, extracts N word shingles and calculate N hash values, store into an array, use this N hash values + // to calculate the first word shingle hash value + // 2. 
next, we extract one word each time and calculate a hash value for the new word, then use the latest N hash
+    //    values to calculate the next word shingle hash value
+    //
+    // data / size: the bytes of the input string.
+    // read_one_word: is expected to copy the next word into the supplied buffer (at most max_word_size
+    //    elements), advance the position reference and return the word length (0 when no word was found).
+    // hash_functor: combines the N stored word hashes, starting at the given offset, into one shingle hash.
+    static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue(
+        const char * data,
+        const size_t size,
+        size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &),
+        UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &))
+    {
+        const char * start = data;
+        const char * end = data + size;
+
+        // Also, a 64-dimension vector initialized to zero
+        Int64 finger_vec[64] = {};
+        // an array to store N word hash values
+        UInt64 nwordHashes[N] = {};
+        // word buffer to store one word
+        CodePoint word_buf[max_word_size] = {};
+        size_t word_size;
+        // get the first word shingle
+        for (size_t i = 0; i < N && start < end; ++i)
+        {
+            word_size = read_one_word(word_buf, start, end, max_word_size);
+            if (word_size)
+            {
+                // for each word, calculate a hash value and store it into the array.
+                // The loop header already advances i, so the slot index must not be incremented here as well:
+                // doing so filled only every second slot and consumed about half of the first N words.
+                nwordHashes[i] = Hash::hashSum(word_buf, word_size);
+            }
+        }
+
+        // calculate the first word shingle hash value
+        UInt64 hash_value = hash_functor(nwordHashes, N, 0);
+        std::bitset<64> bits_(hash_value);
+        for (size_t i = 0; i < 64; ++i)
+        {
+            finger_vec[i] += ((bits_.test(i)) ? 1 : -1);
+        }
+
+        size_t offset = 0;
+        while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size)))
+        {
+            // we need to store the new word hash value to the oldest location.
+ // for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location, + // so we need to store new word hash into location of a0, then ,this array become + // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new + // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| + nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + offset = (offset + 1) % N; + //according to the word hash storation way, in order to not lose the word shingle's + //sequence information, when calculation word shingle hash value, we need provide the offset + //inforation, which is the offset of the first word's hash value of the word shingle + hash_value = hash_functor(nwordHashes, N, offset); + std::bitset<64> bits(hash_value); + for (size_t i = 0; i < 64; ++i) + { + finger_vec[i] += ((bits.test(i)) ? 1 : -1); + } + } + + std::bitset<64> res_bit(0u); + for (size_t i = 0; i < 64; ++i) + { + if (finger_vec[i] > 0) + res_bit.set(i); + } + return res_bit.to_ullong(); + } + + template + static ALWAYS_INLINE inline auto dispatch(CalcFunc calc_func, Args &&... 
args) + { + if constexpr (Ngram) + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + else + return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } + else + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + return calc_func(std::forward(args)..., StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } + } + + // constant string + static inline void constant(const String data, UInt64 & res) + { + if constexpr (Ngram) + res = dispatch(ngramCalculateHashValue, data.data(), data.size()); + else + res = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); + } + + //non-constant string + static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) + { + for (size_t i = 0; i < offsets.size(); ++i) + { + const char * one_data = reinterpret_cast(&data[offsets[i - 1]]); + const size_t data_size = offsets[i] - offsets[i - 1] - 1; + if (data_size <= max_string_size) + { + if constexpr (Ngram) + res[i] = dispatch(ngramCalculateHashValue, one_data, data_size); + else + res[i] = dispatch(wordShinglesCalculateHashValue, one_data, data_size); + } + } + } +}; + +//Minhash: String -> Tuple(UInt64, UInt64) +//for each string, we extract ngram or word shingle, +//for each ngram or word shingle, calculate a hash value, +//then we take the K minimum hash values to calculate a hashsum, +//and take the K maximum hash values to calculate another hashsum, +//return this two hashsum: Tuple(hashsum1, hashsum2) +template +struct MinhashImpl +{ + using ResultType = UInt64; + using StrOp = ExtractStringImpl; + static constexpr size_t max_word_size = 1u << 7; + static constexpr size_t max_string_size = 1u << 15; + static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; + + // insert a new value into K minimum hash 
array if this value
+    // is smaller than the greatest value in the array.
+    // Assumes `hashes` holds K values in ascending order; the greatest one is dropped to make room.
+    static ALWAYS_INLINE inline void insert_minValue(UInt64 * hashes, UInt64 v)
+    {
+        size_t i = 0;
+        for (; i < K && hashes[i] <= v; ++i)
+            ;
+        if (i == K)
+            return;
+        // shift [i, K - 1) one slot to the right. Counting down to i (exclusive) keeps the unsigned
+        // index from wrapping around when i == 0 or K == 1.
+        for (size_t j = K - 1; j > i; --j)
+            hashes[j] = hashes[j - 1];
+        hashes[i] = v;
+    }
+
+    // insert a new value into the K maximum hash array if this value
+    // is greater than the smallest value in the array.
+    // Assumes `hashes` holds K values in ascending order; the smallest one is dropped to make room.
+    static ALWAYS_INLINE inline void insert_maxValue(UInt64 * hashes, UInt64 v)
+    {
+        int i = K - 1;
+        for (; i >= 0 && hashes[i] >= v; --i)
+            ;
+        if (i < 0)
+            return;
+        // shift (0, i] one slot to the left, overwriting the smallest value
+        for (int j = 1; j <= i; ++j)
+            hashes[j - 1] = hashes[j];
+        hashes[i] = v;
+    }
+
+    // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64)
+    // we extract ngrams from the input string and calculate a hash value for each ngram,
+    // then we take the K minimum hash values to calculate a hashsum,
+    // and take the K maximum hash values to calculate another hashsum,
+    // and return these two hashsums: Tuple(hashsum1, hashsum2)
+    //
+    // read_code_points: is expected to refill `cp` from the input, advance the position reference and
+    //    return how many entries of `cp` are usable.
+    // hash_functor: hashes the ngram that starts at the given code point.
+    static ALWAYS_INLINE inline std::tuple<UInt64, UInt64> ngramCalculateHashValue(
+        const char * data,
+        const size_t size,
+        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
+        UInt64 (*hash_functor)(const CodePoint *))
+    {
+        const char * start = data;
+        const char * end = data + size;
+        // we just maintain the K minimum and K maximum hash values
+        UInt64 k_minimum[K];
+        UInt64 k_maxinum[K] = {};
+        // the minimum slots start at the largest UInt64 so that real hashes can displace them;
+        // zero-initialized slots would never be replaced because no hash is smaller than 0
+        for (size_t i = 0; i < K; ++i)
+            k_minimum[i] = ~static_cast<UInt64>(0);
+        CodePoint cp[simultaneously_codepoints_num] = {};
+
+        size_t found = read_code_points(cp, start, end);
+        size_t iter = N - 1;
+
+        do
+        {
+            for (; iter + N <= found; ++iter)
+            {
+                auto new_hash = hash_functor(cp + iter);
+                // insert the new hash value into the arrays used to store the K minimum values
+                // and the K maximum values
+                insert_minValue(k_minimum, new_hash);
+                insert_maxValue(k_maxinum, new_hash);
+            }
+            iter = 0;
+        } while (start < end && (found = read_code_points(cp, start, end)));
+
+        // calculate hashsum of the K minimum hash values and K maximum hash values
+        // (the first element must come from k_minimum; using k_maxinum twice made both elements identical)
+        UInt64 res1 = Hash::hashSum(k_minimum, K);
+        UInt64 res2 = Hash::hashSum(k_maxinum, K);
+        return std::make_tuple(res1, res2);
+    }
+
+    // Minhash word shingle hash value calculate function: String -> Tuple(UInt64, UInt64)
+    // for each word shingle we calculate a hash value, but in fact we just maintain the
+    // K minimum and K maximum hash values
+    //
+    // read_one_word: is expected to copy the next word into the supplied buffer (at most max_word_size
+    //    elements), advance the position reference and return the word length (0 when no word was found).
+    // hash_functor: combines the N stored word hashes, starting at the given offset, into one shingle hash.
+    static ALWAYS_INLINE inline std::tuple<UInt64, UInt64> wordShinglesCalculateHashValue(
+        const char * data,
+        const size_t size,
+        size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &),
+        UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &))
+    {
+        const char * start = data;
+        const char * end = start + size;
+        // also we just store the K minimum and K maximum hash values
+        UInt64 k_minimum[K];
+        UInt64 k_maxinum[K] = {};
+        // start the minimum slots at the largest UInt64 so that real hashes can displace them
+        for (size_t i = 0; i < K; ++i)
+            k_minimum[i] = ~static_cast<UInt64>(0);
+        // array to store n word hashes
+        UInt64 nwordHashes[N] = {};
+        // word buffer to store one word
+        CodePoint word_buf[max_word_size] = {};
+        size_t word_size;
+        // word shingle hash calculation and word hash storage work the same way as
+        // described in the Simhash wordShinglesCalculateHashValue function
+        for (size_t i = 0; i < N && start < end; ++i)
+        {
+            word_size = read_one_word(word_buf, start, end, max_word_size);
+            if (word_size)
+            {
+                // the loop header already advances i; incrementing it here too skipped every second slot
+                nwordHashes[i] = Hash::hashSum(word_buf, word_size);
+            }
+        }
+
+        auto new_hash = hash_functor(nwordHashes, N, 0);
+        insert_minValue(k_minimum, new_hash);
+        insert_maxValue(k_maxinum, new_hash);
+
+        size_t offset = 0;
+        while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size)))
+        {
+            // overwrite the oldest word hash, then hash the shingle starting at the new oldest slot
+            nwordHashes[offset] = Hash::hashSum(word_buf, word_size);
+            offset = (offset + 1) % N;
+            new_hash = hash_functor(nwordHashes, N, offset);
+            insert_minValue(k_minimum, new_hash);
+            insert_maxValue(k_maxinum, new_hash);
+        }
+
+        // calculate hashsum
+        UInt64 res1 = Hash::hashSum(k_minimum, K);
+        UInt64 res2 = Hash::hashSum(k_maxinum, K);
+        return std::make_tuple(res1, res2);
+    }
+
+    template <typename CalcFunc, typename... Args>
+    static 
ALWAYS_INLINE inline auto dispatch(CalcFunc calc_func, Args &&... args) + { + if constexpr (Ngram) + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + else + return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); + } + else + { + if constexpr (!UTF8) + return calc_func(std::forward(args)..., StrOp::readOneASCIIWord, Hash::wordShinglesHash); + else + return calc_func(std::forward(args)..., StrOp::readOneUTF8Word, Hash::wordShinglesHash); + } + } + + // constant string + static void constant(const String data, UInt64 & res1, UInt64 & res2) + { + if constexpr (Ngram) + std::tie(res1, res2) = dispatch(ngramCalculateHashValue, data.data(), data.size()); + else + std::tie(res1, res2) = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); + } + + //non-constant string + static void vector( + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + PaddedPODArray & res1, + PaddedPODArray & res2) + { + for (size_t i = 0; i < offsets.size(); ++i) + { + const char * one_data = reinterpret_cast(&data[offsets[i - 1]]); + const size_t data_size = offsets[i] - offsets[i - 1] - 1; + if (data_size <= max_string_size) + { + if constexpr (Ngram) + std::tie(res1[i], res2[i]) = dispatch(ngramCalculateHashValue, one_data, data_size); + else + std::tie(res1[i], res2[i]) = dispatch(wordShinglesCalculateHashValue, one_data, data_size); + } + } + } +}; + +struct NameNgramSimhash +{ + static constexpr auto name = "ngramSimhash"; +}; + +struct NameNgramSimhashCaseInsensitive +{ + static constexpr auto name = "ngramSimhashCaseInsensitive"; +}; + +struct NameNgramSimhashUTF8 +{ + static constexpr auto name = "ngramSimhashUTF8"; +}; + +struct NameNgramSimhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "ngramSimhashCaseInsensitiveUTF8"; +}; + +struct NameWordShingleSimhash +{ + static constexpr auto name = "wordShingleSimhash"; +}; + +struct 
NameWordShingleSimhashCaseInsensitive +{ + static constexpr auto name = "wordShingleSimhashCaseInsensitive"; +}; + +struct NameWordShingleSimhashUTF8 +{ + static constexpr auto name = "wordShingleSimhashUTF8"; +}; + +struct NameWordShingleSimhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "wordShingleSimhashCaseInsensitiveUTF8"; +}; + +struct NameNgramMinhash +{ + static constexpr auto name = "ngramMinhash"; +}; + +struct NameNgramMinhashCaseInsensitive +{ + static constexpr auto name = "ngramMinhashCaseInsensitive"; +}; + +struct NameNgramMinhashUTF8 +{ + static constexpr auto name = "ngramMinhashUTF8"; +}; + +struct NameNgramMinhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "ngramMinhashCaseInsensitiveUTF8"; +}; + +struct NameWordShingleMinhash +{ + static constexpr auto name = "wordShingleMinhash"; +}; + +struct NameWordShingleMinhashCaseInsensitive +{ + static constexpr auto name = "wordShingleMinhashCaseInsensitive"; +}; + +struct NameWordShingleMinhashUTF8 +{ + static constexpr auto name = "wordShingleMinhashUTF8"; +}; + +struct NameWordShingleMinhashCaseInsensitiveUTF8 +{ + static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8"; +}; + +//Simhash +using FunctionNgramSimhash = FunctionsStringHash, NameNgramSimhash, true>; + +using FunctionNgramSimhashCaseInsensitive + = FunctionsStringHash, NameNgramSimhashCaseInsensitive, true>; + +using FunctionNgramSimhashUTF8 = FunctionsStringHash, NameNgramSimhashUTF8, true>; + +using FunctionNgramSimhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameNgramSimhashCaseInsensitiveUTF8, true>; + +using FunctionWordShingleSimhash = FunctionsStringHash, NameWordShingleSimhash, true>; + +using FunctionWordShingleSimhashCaseInsensitive + = FunctionsStringHash, NameWordShingleSimhashCaseInsensitive, true>; + +using FunctionWordShingleSimhashUTF8 = FunctionsStringHash, NameWordShingleSimhashUTF8, true>; + +using FunctionWordShingleSimhashCaseInsensitiveUTF8 + = FunctionsStringHash, 
NameWordShingleSimhashCaseInsensitiveUTF8, true>; + +//Minhash +using FunctionNgramMinhash = FunctionsStringHash, NameNgramMinhash, false>; + +using FunctionNgramMinhashCaseInsensitive + = FunctionsStringHash, NameNgramMinhashCaseInsensitive, false>; + +using FunctionNgramMinhashUTF8 = FunctionsStringHash, NameNgramMinhashUTF8, false>; + +using FunctionNgramMinhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameNgramMinhashCaseInsensitiveUTF8, false>; + +using FunctionWordShingleMinhash = FunctionsStringHash, NameWordShingleMinhash, false>; + +using FunctionWordShingleMinhashCaseInsensitive + = FunctionsStringHash, NameWordShingleMinhashCaseInsensitive, false>; + +using FunctionWordShingleMinhashUTF8 + = FunctionsStringHash, NameWordShingleMinhashUTF8, false>; + +using FunctionWordShingleMinhashCaseInsensitiveUTF8 + = FunctionsStringHash, NameWordShingleMinhashCaseInsensitiveUTF8, false>; + +void registerFunctionsStringHash(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); +} +} + diff --git a/dbms/src/Functions/FunctionsStringHash.h b/dbms/src/Functions/FunctionsStringHash.h new file mode 100644 index 00000000000..185097ade99 --- /dev/null +++ b/dbms/src/Functions/FunctionsStringHash.h @@ -0,0 +1,124 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int 
TOO_LARGE_STRING_SIZE; +} + +//FunctionStringHash +//Simhash: String -> UInt64 +//Minhash: String -> (UInt64, UInt64) +template +class FunctionsStringHash : public IFunction +{ +public: + static constexpr auto name = Name::name; + + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (IsSimhash) + return std::make_shared>(); + auto element = DataTypeFactory::instance().get("UInt64"); + return std::make_shared(DataTypes{element, element}); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + const ColumnPtr & column = block.getByPosition(arguments[0]).column; + const ColumnConst * col_const = typeid_cast(&*column); + using ResultType = typename Impl::ResultType; + if constexpr (IsSimhash) + { + if (col_const) + { + ResultType res{}; + const String & str_data = col_const->getValue(); + if (str_data.size() > Impl::max_string_size) + { + throw Exception( + "String size is too big for function " + getName() + ". 
Should be at most " + std::to_string(Impl::max_string_size), + ErrorCodes::TOO_LARGE_STRING_SIZE); + } + Impl::constant(str_data, res); + block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(1, toField(res)); + } + else + { + // non const string + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); + block.getByPosition(result).column = std::move(col_res); + } + } + else // Min hash + { + if (col_const) + { + ResultType h1, h2; + const String & str_data = col_const->getValue(); + if (str_data.size() > Impl::max_string_size) + { + throw Exception( + "String size is too big for function " + getName() + ". Should be at most " + std::to_string(Impl::max_string_size), + ErrorCodes::TOO_LARGE_STRING_SIZE); + } + Impl::constant(str_data, h1, h2); + auto h1_col = ColumnVector::create(1); + auto h2_col = ColumnVector::create(1); + typename ColumnVector::Container & h1_data = h1_col->getData(); + typename ColumnVector::Container & h2_data = h2_col->getData(); + h1_data[0] = h1; + h2_data[0] = h2; + MutableColumns tuple_columns; + tuple_columns.emplace_back(std::move(h1_col)); + tuple_columns.emplace_back(std::move(h2_col)); + block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); + } + else + { + //non const string + auto col_h1 = ColumnVector::create(); + auto col_h2 = ColumnVector::create(); + typename ColumnVector::Container & vec_h1 = col_h1->getData(); + typename ColumnVector::Container & vec_h2 = col_h2->getData(); + vec_h1.resize(column->size()); + vec_h2.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); + MutableColumns tuple_columns; + 
tuple_columns.emplace_back(std::move(col_h1)); + tuple_columns.emplace_back(std::move(col_h2)); + block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); + } + } + } +}; +} + diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index 9dda521cd29..c6327ad59b4 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -10,13 +11,6 @@ #include -#include -#include -#include -#include -#include -#include - #ifdef __SSE4_2__ # include #endif @@ -36,6 +30,7 @@ template ; /// map_size for ngram difference. static constexpr size_t map_size = 1u << 16; @@ -44,7 +39,7 @@ struct NgramDistanceImpl static constexpr size_t max_string_size = 1u << 15; /// Default padding to read safely. - static constexpr size_t default_padding = 16; + static constexpr size_t default_padding = StrOp::default_padding; /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding. static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1; @@ -70,102 +65,6 @@ struct NgramDistanceImpl #endif } - template - static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) - { - ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); - } - - static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end) - { - /// Offset before which we copy some data. 
- constexpr size_t padding_offset = default_padding - N + 1; - /// We have an array like this for ASCII (N == 4, other cases are similar) - /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| - /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start - /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction - memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); - /// Now we have an array - /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| - /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - /// Doing unaligned read of 16 bytes and copy them like above - /// 16 is also chosen to do two `movups`. - /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. - memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); - - if constexpr (case_insensitive) - { - /// We really need template lambdas with C++20 to do it inline - unrollLowering(code_points, std::make_index_sequence()); - } - pos += padding_offset; - if (pos > end) - return default_padding - (pos - end); - return default_padding; - } - - static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end) - { - /// The same copying as described in the function above. - memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); - - size_t num = N - 1; - while (num < default_padding && pos < end) - { - size_t length = UTF8::seqLength(*pos); - - if (pos + length > end) - length = end - pos; - - CodePoint res; - /// This is faster than just memcpy because of compiler optimizations with moving bytes. 
- switch (length) - { - case 1: - res = 0; - memcpy(&res, pos, 1); - break; - case 2: - res = 0; - memcpy(&res, pos, 2); - break; - case 3: - res = 0; - memcpy(&res, pos, 3); - break; - default: - memcpy(&res, pos, 4); - } - - /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte. - /// And first bit of first byte if there are two bytes. - /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most cyrrilic letters also does. - /// For others, we don't care now. Lowering UTF is not a cheap operation. - if constexpr (case_insensitive) - { - switch (length) - { - case 4: - res &= ~(1u << (5 + 3 * CHAR_BIT)); - [[fallthrough]]; - case 3: - res &= ~(1u << (5 + 2 * CHAR_BIT)); - [[fallthrough]]; - case 2: - res &= ~(1u); - res &= ~(1u << (5 + CHAR_BIT)); - [[fallthrough]]; - default: - res &= ~(1u << 5); - } - } - - pos += length; - code_points[num++] = res; - } - return num; - } - template static ALWAYS_INLINE inline size_t calculateNeedleStats( const char * data, @@ -250,9 +149,9 @@ struct NgramDistanceImpl static inline auto dispatchSearcher(Callback callback, Args &&... args) { if constexpr (!UTF8) - return callback(std::forward(args)..., readASCIICodePoints, ASCIIHash); + return callback(std::forward(args)..., StrOp::readASCIICodePoints, ASCIIHash); else - return callback(std::forward(args)..., readUTF8CodePoints, UTF8Hash); + return callback(std::forward(args)..., StrOp::readUTF8CodePoints, UTF8Hash); } static void constant_constant(std::string data, std::string needle, Float32 & res) @@ -269,7 +168,8 @@ struct NgramDistanceImpl size_t distance = second_size; if (data_size <= max_string_size) { - size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); + size_t first_size + = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); /// For !symmetric version we should not use first_size. 
if constexpr (symmetric) res = distance * 1.f / std::max(first_size + second_size, size_t(1)); @@ -313,23 +213,14 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { /// Get needle stats. - const size_t needle_stats_size = dispatchSearcher( - calculateNeedleStats, - needle, - needle_size, - common_stats, - needle_ngram_storage.get()); + const size_t needle_stats_size + = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); size_t distance = needle_stats_size; /// Combine with haystack stats, return to initial needle stats. const size_t haystack_stats_size = dispatchSearcher( - calculateHaystackStatsAndMetric, - haystack, - haystack_size, - common_stats, - distance, - haystack_ngram_storage.get()); + calculateHaystackStatsAndMetric, haystack, haystack_size, common_stats, distance, haystack_ngram_storage.get()); /// Return to zero array stats. for (size_t j = 0; j < needle_stats_size; ++j) @@ -391,12 +282,8 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { - const size_t needle_stats_size = dispatchSearcher( - calculateNeedleStats, - needle, - needle_size, - common_stats, - needle_ngram_storage.get()); + const size_t needle_stats_size + = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); size_t distance = needle_stats_size; @@ -420,15 +307,11 @@ struct NgramDistanceImpl prev_offset = needle_offsets[i]; } - } } static void vector_constant( - const ColumnString::Chars & data, - const ColumnString::Offsets & offsets, - std::string needle, - PaddedPODArray & res) + const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) { /// zeroing our map NgramStats common_stats = {}; @@ -454,7 +337,8 @@ struct NgramDistanceImpl size_t haystack_stats_size = dispatchSearcher( calculateHaystackStatsAndMetric, 
reinterpret_cast(haystack), - haystack_size, common_stats, + haystack_size, + common_stats, distance, ngram_storage.get()); /// For !symmetric version we should not use haystack_stats_size. @@ -516,14 +400,18 @@ struct NameNgramSearchUTF8CaseInsensitive }; using FunctionNgramDistance = FunctionsStringSimilarity, NameNgramDistance>; -using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; +using FunctionNgramDistanceCaseInsensitive + = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8>; -using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; +using FunctionNgramDistanceCaseInsensitiveUTF8 + = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; using FunctionNgramSearch = FunctionsStringSimilarity, NameNgramSearch>; -using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; +using FunctionNgramSearchCaseInsensitive + = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; using FunctionNgramSearchUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8>; -using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; +using FunctionNgramSearchCaseInsensitiveUTF8 + = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; void registerFunctionsStringSimilarity(FunctionFactory & factory) From e0cf07e958c77cf3b7f1faeb727ba3541ae00f18 Mon Sep 17 00:00:00 2001 From: fenglv Date: Wed, 6 Nov 2019 18:35:23 +0800 Subject: [PATCH 002/504] add hammingdistance function --- dbms/src/Functions/bitHammingDistance.cpp | 174 ++++++++++++++ dbms/src/Functions/registerFunctions.cpp | 2 + .../Functions/registerFunctionsArithmetic.cpp | 4 + dbms/src/Functions/tupleHammingDistance.cpp | 224 ++++++++++++++++++ 4 files changed, 404 insertions(+) create mode 100644 
dbms/src/Functions/bitHammingDistance.cpp create mode 100644 dbms/src/Functions/tupleHammingDistance.cpp diff --git a/dbms/src/Functions/bitHammingDistance.cpp b/dbms/src/Functions/bitHammingDistance.cpp new file mode 100644 index 00000000000..2572720bb4e --- /dev/null +++ b/dbms/src/Functions/bitHammingDistance.cpp @@ -0,0 +1,174 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + + +template +struct BitHammingDistanceImpl +{ + using ResultType = UInt8; + + static void NO_INLINE vector_vector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a[i], b[i]); + } + + static void NO_INLINE vector_constant(const PaddedPODArray & a, B b, PaddedPODArray & c) + { + size_t size = a.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a[i], b); + } + + static void NO_INLINE constant_vector(A a, const PaddedPODArray & b, PaddedPODArray & c) + { + size_t size = b.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a, b[i]); + } + + static ResultType constant_constant(A a, B b) { return apply(a, b); } + +private: + static UInt8 pop_cnt(UInt64 res) + { + UInt8 count = 0; + for (; res; res >>= 1) + count += res & 1u; + return count; + } + + static inline UInt8 apply(UInt64 a, UInt64 b) + { + UInt64 res = a ^ b; + return pop_cnt(res); + } +}; + +template +bool castType(const IDataType * type, F && f) +{ + return castTypeToEither< + DataTypeInt8, + DataTypeInt16, + DataTypeInt32, + DataTypeInt64, + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); +} + +template +static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) +{ + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); +} + 
+//bitHammingDistance function: (Integer, Integer) -> UInt8 +class FunctionBitHammingDistance : public IFunction +{ +public: + static constexpr auto name = "bitHammingDistance"; + using ResultType = UInt8; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isInteger(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isInteger(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + auto * left_generic = block.getByPosition(arguments[0]).type.get(); + auto * right_generic = block.getByPosition(arguments[1]).type.get(); + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + using LeftDataType = std::decay_t; + using RightDataType = std::decay_t; + using T0 = typename LeftDataType::FieldType; + using T1 = typename RightDataType::FieldType; + using ColVecT0 = ColumnVector; + using ColVecT1 = ColumnVector; + using ColVecResult = ColumnVector; + + using OpImpl = BitHammingDistanceImpl; + + auto col_left_raw = block.getByPosition(arguments[0]).column.get(); + auto col_right_raw = block.getByPosition(arguments[1]).column.get(); + if (auto col_left = checkAndGetColumnConst(col_left_raw)) + { + if (auto col_right = checkAndGetColumnConst(col_right_raw)) + { + //constant integer - constant integer + auto res = OpImpl::constant_constant(col_left->template getValue(), col_right->template getValue()); + block.getByPosition(result).column = 
DataTypeUInt8().createColumnConst(col_left->size(), toField(res)); + return true; + } + } + + typename ColVecResult::MutablePtr col_res = nullptr; + col_res = ColVecResult::create(); + + auto & vec_res = col_res->getData(); + vec_res.resize(block.rows()); + + if (auto col_left_const = checkAndGetColumnConst(col_left_raw)) + { + if (auto col_right = checkAndGetColumn(col_right_raw)) + { + // constant integer - non-constant integer + OpImpl::constant_vector(col_left_const->template getValue(), col_right->getData(), vec_res); + } + else + return false; + } + else if (auto col_left = checkAndGetColumn(col_left_raw)) + { + if (auto col_right = checkAndGetColumn(col_right_raw)) + //non-constant integer - non-constant integer + OpImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); + else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) + //non-constant integer - constant integer + OpImpl::vector_constant(col_left->getData(), col_right_const->template getValue(), vec_res); + else + return false; + } + else + return false; + + block.getByPosition(result).column = std::move(col_res); + return true; + }); + if (!valid) + throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + } +}; + +void registerFunctionBitHammingDistance(FunctionFactory & factory) +{ + factory.registerFunction(); +} +} diff --git a/dbms/src/Functions/registerFunctions.cpp b/dbms/src/Functions/registerFunctions.cpp index 501f8e7f90a..09000a1dadd 100644 --- a/dbms/src/Functions/registerFunctions.cpp +++ b/dbms/src/Functions/registerFunctions.cpp @@ -41,6 +41,7 @@ void registerFunctionsFindCluster(FunctionFactory &); void registerFunctionsJSON(FunctionFactory &); void registerFunctionsIntrospection(FunctionFactory &); void registerFunctionsConsistentHashing(FunctionFactory & factory); +void registerFunctionsStringHash(FunctionFactory & factory); void registerFunctions() { @@ -80,6 +81,7 @@ void registerFunctions() 
registerFunctionsJSON(factory); registerFunctionsIntrospection(factory); registerFunctionsConsistentHashing(factory); + registerFunctionsStringHash(factory); } } diff --git a/dbms/src/Functions/registerFunctionsArithmetic.cpp b/dbms/src/Functions/registerFunctionsArithmetic.cpp index 1faa28e395e..a03058c37e9 100644 --- a/dbms/src/Functions/registerFunctionsArithmetic.cpp +++ b/dbms/src/Functions/registerFunctionsArithmetic.cpp @@ -32,6 +32,8 @@ void registerFunctionIntExp10(FunctionFactory & factory); void registerFunctionRoundToExp2(FunctionFactory & factory); void registerFunctionRoundDuration(FunctionFactory & factory); void registerFunctionRoundAge(FunctionFactory & factory); +void registerFunctionBitHammingDistance(FunctionFactory & factory); +void registerFunctionTupleHammingDistance(FunctionFactory & factory); void registerFunctionBitBoolMaskOr(FunctionFactory & factory); void registerFunctionBitBoolMaskAnd(FunctionFactory & factory); @@ -69,6 +71,8 @@ void registerFunctionsArithmetic(FunctionFactory & factory) registerFunctionRoundToExp2(factory); registerFunctionRoundDuration(factory); registerFunctionRoundAge(factory); + registerFunctionBitHammingDistance(factory); + registerFunctionTupleHammingDistance(factory); /// Not for external use. 
registerFunctionBitBoolMaskOr(factory); diff --git a/dbms/src/Functions/tupleHammingDistance.cpp b/dbms/src/Functions/tupleHammingDistance.cpp new file mode 100644 index 00000000000..4a727aef59a --- /dev/null +++ b/dbms/src/Functions/tupleHammingDistance.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +template +struct TupleHammingDistanceImpl +{ + using ResultType = UInt8; + + static void NO_INLINE vector_vector( + const PaddedPODArray & a1, + const PaddedPODArray & b1, + const PaddedPODArray & a2, + const PaddedPODArray & b2, + PaddedPODArray & c) + { + size_t size = a1.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1[i], a2[i]) + apply(b1[i], b2[i]); + } + + static void NO_INLINE + vector_constant(const PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) + { + size_t size = a1.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1[i], a2) + apply(b1[i], b2); + } + + static void NO_INLINE + constant_vector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) + { + size_t size = a2.size(); + for (size_t i = 0; i < size; ++i) + c[i] = apply(a1, a2[i]) + apply(b1, b2[i]); + } + + static ResultType constant_constant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } + +private: + static UInt8 pop_cnt(UInt64 res) + { + UInt8 count = 0; + for (; res; res >>= 1) + count += res & 1u; + return count; + } + + static inline UInt8 apply(UInt64 a, UInt64 b) + { + UInt64 res = a ^ b; + return pop_cnt(res); + } +}; + +template +bool castType(const IDataType * type, F && f) +{ + return castTypeToEither< + DataTypeInt8, + DataTypeInt16, + DataTypeInt32, + DataTypeInt64, + DataTypeUInt8, + DataTypeUInt16, + DataTypeUInt32, + DataTypeUInt64>(type, std::forward(f)); +} 
+ +template +static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) +{ + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); +} + +//tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 +//in order to avoid code bloating, for non-constant tuple, we make sure that the elements +//in the tuple should have same data type, and for constant tuple, elements can be any integer +//data type, we cast all of them into UInt64 +class FunctionTupleHammingDistance : public IFunction +{ +public: + static constexpr auto name = "tupleHammingDistance"; + using ResultType = UInt8; + static FunctionPtr create(const Context &) { return std::make_shared(); } + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isTuple(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + if (!isTuple(arguments[1])) + throw Exception( + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(); + } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + { + const ColumnWithTypeAndName & arg1 = block.getByPosition(arguments[0]); + const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); + const DataTypeTuple & type1 = static_cast(*arg1.type); + const DataTypeTuple & type2 = static_cast(*arg2.type); + auto & left_elems = type1.getElements(); + auto & right_elems = type2.getElements(); + if (left_elems.size() != 2 || right_elems.size() != 2) + throw Exception( + "Illegal column of arguments of function " + getName() + ", tuple should have exactly two 
elements.", + ErrorCodes::ILLEGAL_COLUMN); + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { + using LeftDataType = std::decay_t; + using RightDataType = std::decay_t; + using T0 = typename LeftDataType::FieldType; + using T1 = typename RightDataType::FieldType; + using ColVecT0 = ColumnVector; + using ColVecT1 = ColumnVector; + using ColVecResult = ColumnVector; + + using OpImpl = TupleHammingDistanceImpl; + + // constant tuple - constant tuple + if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) + { + if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) + { + auto cols1 = convertConstTupleToConstantElements(*const_col_left); + auto cols2 = convertConstTupleToConstantElements(*const_col_right); + Field a1, b1, a2, b2; + cols1[0]->get(0, a1); + cols1[1]->get(0, b1); + cols2[0]->get(0, a2); + cols2[1]->get(0, b2); + auto res = OpImpl::constant_constant(a1.get(), b1.get(), a2.get(), b2.get()); + block.getByPosition(result).column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); + return true; + } + } + + typename ColVecResult::MutablePtr col_res = nullptr; + col_res = ColVecResult::create(); + auto & vec_res = col_res->getData(); + vec_res.resize(block.rows()); + // constant tuple - non-constant tuple + if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) + { + if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) + { + auto const_cols = convertConstTupleToConstantElements(*const_col_left); + Field a1, b1; + const_cols[0]->get(0, a1); + const_cols[1]->get(0, b1); + auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); + auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); + if (col_r1 && col_r2) + OpImpl::constant_vector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); + else + return false; + } + else + return false; + } + else if (const 
ColumnTuple * col_left = typeid_cast(arg1.column.get())) + { + auto col_l1 = checkAndGetColumn(&col_left->getColumn(0)); + auto col_l2 = checkAndGetColumn(&col_left->getColumn(1)); + if (col_l1 && col_l2) + { + // non-constant tuple - constant tuple + if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) + { + auto const_cols = convertConstTupleToConstantElements(*const_col_right); + Field a2, b2; + const_cols[0]->get(0, a2); + const_cols[1]->get(0, b2); + OpImpl::vector_constant(col_l1->getData(), col_l2->getData(), a2.get(), b2.get(), vec_res); + } + // non-constant tuple - non-constant tuple + else if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) + { + auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); + auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); + if (col_r1 && col_r2) + OpImpl::vector_vector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res); + else + return false; + } + else + return false; + } + else + return false; + } + else + return false; + block.getByPosition(result).column = std::move(col_res); + return true; + }); + if (!valid) + throw Exception(getName() + "'s arguments do not match the expected data types", ErrorCodes::ILLEGAL_COLUMN); + } +}; + +void registerFunctionTupleHammingDistance(FunctionFactory & factory) +{ + factory.registerFunction(); +} +} From 9403dd1520dfd7a887a159ab0af0da699747e2ec Mon Sep 17 00:00:00 2001 From: fenglv Date: Wed, 6 Nov 2019 18:35:55 +0800 Subject: [PATCH 003/504] add test fix comment style fix lambda function style --- dbms/src/Functions/ExtractString.h | 26 ++++----- dbms/src/Functions/FunctionsStringHash.cpp | 54 +++++++++---------- dbms/src/Functions/FunctionsStringHash.h | 10 ++-- dbms/src/Functions/bitHammingDistance.cpp | 15 +++--- dbms/src/Functions/tupleHammingDistance.cpp | 15 +++--- .../01016_simhash_minhash.reference | 50 +++++++++++++++++ .../0_stateless/01016_simhash_minhash.sql | 47
++++++++++++++++ .../01017_bithamming_distance.reference | 15 ++++++ .../0_stateless/01017_bithamming_distance.sql | 20 +++++++ .../01017_tuplehamming_distance.reference | 15 ++++++ .../01017_tuplehamming_distance.sql | 19 +++++++ 11 files changed, 228 insertions(+), 58 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01016_simhash_minhash.reference create mode 100644 dbms/tests/queries/0_stateless/01016_simhash_minhash.sql create mode 100644 dbms/tests/queries/0_stateless/01017_bithamming_distance.reference create mode 100644 dbms/tests/queries/0_stateless/01017_bithamming_distance.sql create mode 100644 dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference create mode 100644 dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql diff --git a/dbms/src/Functions/ExtractString.h b/dbms/src/Functions/ExtractString.h index 05566496cba..c74b5175ea6 100644 --- a/dbms/src/Functions/ExtractString.h +++ b/dbms/src/Functions/ExtractString.h @@ -12,8 +12,8 @@ namespace DB { -//used by FunctionsStringSimilarity and FunctionsStringHash -//includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +// used by FunctionsStringSimilarity and FunctionsStringHash +// includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word template struct ExtractStringImpl { @@ -47,14 +47,14 @@ struct ExtractStringImpl return default_padding; } - //used by FunctionsStringHash - //it's not easy to add padding for ColumnString, so we need safety check each memcpy + // used by FunctionsStringHash + // it's not easy to add padding for ColumnString, so we need safety check each memcpy static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end) { constexpr size_t padding_offset = default_padding - N + 1; memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); - //safety check + // safety check size_t cpy_size = (pos + padding_offset > end) ? 
end - pos : padding_offset; memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8)); @@ -69,12 +69,12 @@ struct ExtractStringImpl return default_padding; } - //read a ASCII word from pos to word - //if the word size exceeds max_word_size, only read max_word_size byte - //in FuntionsStringHash, the default value of max_word_size is 128 + // read a ASCII word from pos to word + // if the word size exceeds max_word_size, only read max_word_size byte + // in FuntionsStringHash, the default value of max_word_size is 128 static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size) { - //jump seperators + // jump seperators while (pos < end && !isAlphaNum(*pos)) ++pos; @@ -105,14 +105,14 @@ struct ExtractStringImpl return num; } - //read one UTF8 word from pos to word - //also, we assume that one word size cann't exceed max_word_size with default value 128 + // read one UTF8 word from pos to word + // also, we assume that one word size cann't exceed max_word_size with default value 128 static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size) { // jump UTF8 seperator while (pos < end && isUTF8Sep(*pos)) ++pos; - //UTF8 word's character number + // UTF8 word's character number size_t num = 0; while (pos < end && num < max_word_size && !isUTF8Sep(*pos)) { @@ -133,7 +133,7 @@ private: ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); } - //we use ASCII non-alphanum character as UTF8 seperator + // we use ASCII non-alphanum character as UTF8 seperator static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); } // read one UTF8 character and return it diff --git a/dbms/src/Functions/FunctionsStringHash.cpp b/dbms/src/Functions/FunctionsStringHash.cpp index 797d7d30078..215d49544cb 100644 --- a/dbms/src/Functions/FunctionsStringHash.cpp +++ 
b/dbms/src/Functions/FunctionsStringHash.cpp @@ -75,7 +75,7 @@ struct Hash } }; -//Sinhash String -> UInt64 +// Sinhash String -> UInt64 template struct SimhashImpl { @@ -123,8 +123,8 @@ struct SimhashImpl iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); - //finally, we return a 64 bit value according to finger_vec - //if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 + // finally, we return a 64 bit value according to finger_vec + // if finger_vec[i] > 0, the i'th bit of the value is 1, otherwise 0 std::bitset<64> res_bit(0u); for (size_t i = 0; i < 64; ++i) { @@ -160,7 +160,7 @@ struct SimhashImpl // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; - //get first word shingle + // get first word shingle for (size_t i = 0; i < N && start < end; ++i) { word_size = read_one_word(word_buf, start, end, max_word_size); @@ -189,9 +189,9 @@ struct SimhashImpl // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| nwordHashes[offset] = Hash::hashSum(word_buf, word_size); offset = (offset + 1) % N; - //according to the word hash storation way, in order to not lose the word shingle's - //sequence information, when calculation word shingle hash value, we need provide the offset - //inforation, which is the offset of the first word's hash value of the word shingle + // according to the word hash storation way, in order to not lose the word shingle's + // sequence information, when calculation word shingle hash value, we need provide the offset + // inforation, which is the offset of the first word's hash value of the word shingle hash_value = hash_functor(nwordHashes, N, offset); std::bitset<64> bits(hash_value); for (size_t i = 0; i < 64; ++i) @@ -237,7 +237,7 @@ struct SimhashImpl res = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); } - //non-constant string + // non-constant string static void vector(const ColumnString::Chars & data, const 
ColumnString::Offsets & offsets, PaddedPODArray & res) { for (size_t i = 0; i < offsets.size(); ++i) @@ -255,12 +255,12 @@ struct SimhashImpl } }; -//Minhash: String -> Tuple(UInt64, UInt64) -//for each string, we extract ngram or word shingle, -//for each ngram or word shingle, calculate a hash value, -//then we take the K minimum hash values to calculate a hashsum, -//and take the K maximum hash values to calculate another hashsum, -//return this two hashsum: Tuple(hashsum1, hashsum2) +// Minhash: String -> Tuple(UInt64, UInt64) +// for each string, we extract ngram or word shingle, +// for each ngram or word shingle, calculate a hash value, +// then we take the K minimum hash values to calculate a hashsum, +// and take the K maximum hash values to calculate another hashsum, +// return this two hashsum: Tuple(hashsum1, hashsum2) template struct MinhashImpl { @@ -298,11 +298,11 @@ struct MinhashImpl hashes[i] = v; } - //Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) - //we extract ngram from input string, and calculate a hash value for each ngram - //then we take the K minimum hash values to calculate a hashsum, - //and take the K maximum hash values to calculate another hashsum, - //return this two hashsum: Tuple(hashsum1, hashsum2) + // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) + // we extract ngram from input string, and calculate a hash value for each ngram + // then we take the K minimum hash values to calculate a hashsum, + // and take the K maximum hash values to calculate another hashsum, + // return this two hashsum: Tuple(hashsum1, hashsum2) static ALWAYS_INLINE inline std::tuple ngramCalculateHashValue( const char * data, const size_t size, @@ -339,8 +339,8 @@ struct MinhashImpl } // Minhash word shingle hash value calculate function: String ->Tuple(UInt64, UInt64) - //for each word shingle, we calculate a hash value, but in fact, we just maintain the - //K minimum and K maximum hash value + // for each word 
shingle, we calculate a hash value, but in fact, we just maintain the + // K minimum and K maximum hash value static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( const char * data, const size_t size, @@ -349,7 +349,7 @@ struct MinhashImpl { const char * start = data; const char * end = start + size; - //also we just store the K minimu and K maximum hash values + // also we just store the K minimu and K maximum hash values UInt64 k_minimum[K] = {}; UInt64 k_maxinum[K] = {}; // array to store n word hashes @@ -357,8 +357,8 @@ struct MinhashImpl // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; - //how word shingle hash value calculation and word hash storation is same as we - //have descripted in Simhash wordShinglesCalculateHashValue function + // how word shingle hash value calculation and word hash storation is same as we + // have descripted in Simhash wordShinglesCalculateHashValue function for (size_t i = 0; i < N && start < end; ++i) { word_size = read_one_word(word_buf, start, end, max_word_size); @@ -416,7 +416,7 @@ struct MinhashImpl std::tie(res1, res2) = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); } - //non-constant string + // non-constant string static void vector( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, @@ -518,7 +518,7 @@ struct NameWordShingleMinhashCaseInsensitiveUTF8 static constexpr auto name = "wordShingleMinhashCaseInsensitiveUTF8"; }; -//Simhash +// Simhash using FunctionNgramSimhash = FunctionsStringHash, NameNgramSimhash, true>; using FunctionNgramSimhashCaseInsensitive @@ -539,7 +539,7 @@ using FunctionWordShingleSimhashUTF8 = FunctionsStringHash, NameWordShingleSimhashCaseInsensitiveUTF8, true>; -//Minhash +// Minhash using FunctionNgramMinhash = FunctionsStringHash, NameNgramMinhash, false>; using FunctionNgramMinhashCaseInsensitive diff --git a/dbms/src/Functions/FunctionsStringHash.h 
b/dbms/src/Functions/FunctionsStringHash.h index 185097ade99..bb1e42ab5fa 100644 --- a/dbms/src/Functions/FunctionsStringHash.h +++ b/dbms/src/Functions/FunctionsStringHash.h @@ -15,14 +15,12 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int TOO_LARGE_STRING_SIZE; } -//FunctionStringHash -//Simhash: String -> UInt64 -//Minhash: String -> (UInt64, UInt64) +// FunctionStringHash +// Simhash: String -> UInt64 +// Minhash: String -> (UInt64, UInt64) template class FunctionsStringHash : public IFunction { @@ -103,7 +101,7 @@ public: } else { - //non const string + // non const string auto col_h1 = ColumnVector::create(); auto col_h2 = ColumnVector::create(); typename ColumnVector::Container & vec_h1 = col_h1->getData(); diff --git a/dbms/src/Functions/bitHammingDistance.cpp b/dbms/src/Functions/bitHammingDistance.cpp index 2572720bb4e..fdef72d4c43 100644 --- a/dbms/src/Functions/bitHammingDistance.cpp +++ b/dbms/src/Functions/bitHammingDistance.cpp @@ -75,10 +75,12 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); + return castType(left, [&](const auto & left_) { + return castType(right, [&](const auto & right_) { return f(left_, right_); }); + }); } -//bitHammingDistance function: (Integer, Integer) -> UInt8 +// bitHammingDistance function: (Integer, Integer) -> UInt8 class FunctionBitHammingDistance : public IFunction { public: @@ -105,7 +107,8 @@ public: { auto * left_generic = block.getByPosition(arguments[0]).type.get(); auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + bool valid = 
castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -122,7 +125,7 @@ public: { if (auto col_right = checkAndGetColumnConst(col_right_raw)) { - //constant integer - constant integer + // constant integer - constant integer auto res = OpImpl::constant_constant(col_left->template getValue(), col_right->template getValue()); block.getByPosition(result).column = DataTypeUInt8().createColumnConst(col_left->size(), toField(res)); return true; @@ -148,10 +151,10 @@ public: else if (auto col_left = checkAndGetColumn(col_left_raw)) { if (auto col_right = checkAndGetColumn(col_right_raw)) - //non-constant integer - non-constant integer + // non-constant integer - non-constant integer OpImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) - //non-constant integer - constant integer + // non-constant integer - constant integer OpImpl::vector_constant(col_left->getData(), col_right_const->template getValue(), vec_res); else return false; diff --git a/dbms/src/Functions/tupleHammingDistance.cpp b/dbms/src/Functions/tupleHammingDistance.cpp index 4a727aef59a..45c113edad4 100644 --- a/dbms/src/Functions/tupleHammingDistance.cpp +++ b/dbms/src/Functions/tupleHammingDistance.cpp @@ -83,13 +83,15 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); + return castType(left, [&](const auto & left_) { + return castType(right, [&](const auto & right_) { return f(left_, right_); }); + }); } -//tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 -//in order to avoid code bloating, for 
non-constant tuple, we make sure that the elements -//in the tuple should have same data type, and for constant tuple, elements can be any integer -//data type, we cast all of them into UInt64 +// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 +// in order to avoid code bloating, for non-constant tuple, we make sure that the elements +// in the tuple should have same data type, and for constant tuple, elements can be any integer +// data type, we cast all of them into UInt64 class FunctionTupleHammingDistance : public IFunction { public: @@ -124,7 +126,8 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); - bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; diff --git a/dbms/tests/queries/0_stateless/01016_simhash_minhash.reference b/dbms/tests/queries/0_stateless/01016_simhash_minhash.reference new file mode 100644 index 00000000000..fa62adde45c --- /dev/null +++ b/dbms/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -0,0 +1,50 @@ +0 +2718169299 +2718169299 +3333471646 +26585365 +4151513063 +4151513063 +4151513063 +3150464485 +(0,0) +(2736268688,2736268688) +(2736268688,2736268688) +(916562399,916562399) +(3436376151,3436376151) +(0,3423682776) +(0,3423682776) +(0,3423682776) +(0,2393737641) +2548869326 +2548869326 +401385678 +401385710 +4258739090 +4260836242 +718415633 +718681881 +4026448893 +4026449917 +4026466301 +4026466301 +4026448893 +4026449917 +3957325823 +4217372671 +(3946088007,3946088007) +(3946088007,3946088007) +(2332295796,2332295796) +(535012010,535012010) +(3696559901,3696559901) 
+(3696559901,3696559901) +(169287209,169287209) +(169287209,169287209) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1509393235) +(0,1975937193) +(0,1975937193) diff --git a/dbms/tests/queries/0_stateless/01016_simhash_minhash.sql b/dbms/tests/queries/0_stateless/01016_simhash_minhash.sql new file mode 100644 index 00000000000..9e87216d26f --- /dev/null +++ b/dbms/tests/queries/0_stateless/01016_simhash_minhash.sql @@ -0,0 +1,47 @@ +SELECT ngramSimhash(''); +SELECT ngramSimhash('what a cute cat.'); +SELECT ngramSimhashCaseInsensitive('what a cute cat.'); +SELECT ngramSimhashUTF8('what a cute cat.'); +SELECT ngramSimhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleSimhash('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleSimhashUTF8('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitiveUTF8('what a cute cat.'); + +SELECT ngramMinhash(''); +SELECT ngramMinhash('what a cute cat.'); +SELECT ngramMinhashCaseInsensitive('what a cute cat.'); +SELECT ngramMinhashUTF8('what a cute cat.'); +SELECT ngramMinhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleMinhash('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleMinhashUTF8('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitiveUTF8('what a cute cat.'); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + s String +)ENGINE = Memory(); + +INSERT INTO defaults values ('It is the latest occurrence of the Southeast European haze, the issue that occurs in constant intensity during every wet season. 
It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.') ('It is the latest occurrence of the Southeast Asian haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.'); + +SELECT ngramSimhash(s) FROM defaults; +SELECT ngramSimhashCaseInsensitive(s) FROM defaults; +SELECT ngramSimhashUTF8(s) FROM defaults; +SELECT ngramSimhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleSimhash(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleSimhashUTF8(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitiveUTF8(s) FROM defaults; + +SELECT ngramMinhash(s) FROM defaults; +SELECT ngramMinhashCaseInsensitive(s) FROM defaults; +SELECT ngramMinhashUTF8(s) FROM defaults; +SELECT ngramMinhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleMinhash(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleMinhashUTF8(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitiveUTF8(s) FROM defaults; + +DROP TABLE defaults; diff --git a/dbms/tests/queries/0_stateless/01017_bithamming_distance.reference b/dbms/tests/queries/0_stateless/01017_bithamming_distance.reference new file mode 100644 index 00000000000..cc2d4f39154 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01017_bithamming_distance.reference @@ -0,0 +1,15 @@ +1 +7 +63 +2 +1 +3 +5 +4 +6 +6 +6 +3 +5 +9 +9 diff --git a/dbms/tests/queries/0_stateless/01017_bithamming_distance.sql b/dbms/tests/queries/0_stateless/01017_bithamming_distance.sql new file mode 100644 index 00000000000..4b36894b97c --- /dev/null +++ 
b/dbms/tests/queries/0_stateless/01017_bithamming_distance.sql @@ -0,0 +1,20 @@ +SELECT bitHammingDistance(1, 5); +SELECT bitHammingDistance(100, 100000); +SELECT bitHammingDistance(-1, 1); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + n1 UInt8, + n2 UInt16, + n3 UInt32, + n4 UInt64 +)ENGINE = Memory(); + +INSERT INTO defaults VALUES (1, 2, 3, 4) (12, 4345, 435, 1233) (45, 675, 32343, 54566) (90, 784, 9034, 778752); + +SELECT bitHammingDistance(4, n1) FROM defaults; +SELECT bitHammingDistance(n2, 100) FROM defaults; +SELECT bitHammingDistance(n3, n4) FROM defaults; + +DROP TABLE defaults; diff --git a/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference new file mode 100644 index 00000000000..eee1a7eee3b --- /dev/null +++ b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.reference @@ -0,0 +1,15 @@ +3 +5 +60 +5 +3 +10 +10 +114 +119 +111 +104 +69 +13 +65 +25 diff --git a/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql new file mode 100644 index 00000000000..0db73232bb3 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01017_tuplehamming_distance.sql @@ -0,0 +1,19 @@ +SELECT tupleHammingDistance((1, 2), (3, 4)); +SELECT tupleHammingDistance((120, 2434), (123, 434)); +SELECT tupleHammingDistance((-12, 434), (987, 432)); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + t1 Tuple(UInt16, UInt16), + t2 Tuple(UInt32, UInt32), + t3 Tuple(Int64, Int64) +)ENGINE = Memory(); + +INSERT INTO defaults VALUES ((12, 43), (12312, 43453) ,(-10, 32)) ((1, 4), (546, 12345), (123, 456)) ((90, 9875), (43456, 234203), (1231, -123)) ((87, 987), (545645, 768354634), (9123, 909)); + +SELECT tupleHammingDistance((1, 3), t1) FROM defaults; +SELECT tupleHammingDistance(t2, (-1, 1)) FROM defaults; +SELECT tupleHammingDistance(t2, t3) FROM defaults; + +DROP TABLE defaults; From 
ced7fe59dbe48f261abb3fec427eadbc50ba7c5f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 5 Dec 2019 06:48:40 +0300 Subject: [PATCH 004/504] Update ExtractString.h --- dbms/src/Functions/ExtractString.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/ExtractString.h b/dbms/src/Functions/ExtractString.h index c74b5175ea6..040e62d9580 100644 --- a/dbms/src/Functions/ExtractString.h +++ b/dbms/src/Functions/ExtractString.h @@ -13,7 +13,7 @@ namespace DB { // used by FunctionsStringSimilarity and FunctionsStringHash -// includes exacting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word +// includes extracting ASCII ngram, UTF8 ngram, ASCII word and UTF8 word template struct ExtractStringImpl { From 241fd556576fc7833174c5346568732e1742a8d8 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 5 Dec 2019 07:08:35 +0300 Subject: [PATCH 005/504] Update FunctionsStringHash.cpp --- dbms/src/Functions/FunctionsStringHash.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsStringHash.cpp b/dbms/src/Functions/FunctionsStringHash.cpp index 215d49544cb..d7277fcb98b 100644 --- a/dbms/src/Functions/FunctionsStringHash.cpp +++ b/dbms/src/Functions/FunctionsStringHash.cpp @@ -75,7 +75,7 @@ struct Hash } }; -// Sinhash String -> UInt64 +// Simhash String -> UInt64 template struct SimhashImpl { From 83c0807b43d7ca5587b16c43a577fec6ee51ec75 Mon Sep 17 00:00:00 2001 From: feng lv Date: Fri, 22 May 2020 21:23:49 +0800 Subject: [PATCH 006/504] update update name --- src/Functions/ExtractString.h | 14 +-- src/Functions/FunctionsStringHash.cpp | 106 ++++++++---------- src/Functions/FunctionsStringHash.h | 8 +- src/Functions/bitHammingDistance.cpp | 31 +---- src/Functions/registerFunctions.cpp | 7 +- src/Functions/tupleHammingDistance.cpp | 27 ++--- .../01016_simhash_minhash.reference | 59 ++++++++++ .../0_stateless/01016_simhash_minhash.sql | 47 ++++++++ .../01017_bithamming_distance.reference 
| 15 +++ .../0_stateless/01017_bithamming_distance.sql | 20 ++++ .../01017_tuplehamming_distance.reference | 15 +++ .../01017_tuplehamming_distance.sql | 19 ++++ 12 files changed, 251 insertions(+), 117 deletions(-) create mode 100644 tests/queries/0_stateless/01016_simhash_minhash.reference create mode 100644 tests/queries/0_stateless/01016_simhash_minhash.sql create mode 100644 tests/queries/0_stateless/01017_bithamming_distance.reference create mode 100644 tests/queries/0_stateless/01017_bithamming_distance.sql create mode 100644 tests/queries/0_stateless/01017_tuplehamming_distance.reference create mode 100644 tests/queries/0_stateless/01017_tuplehamming_distance.sql diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index 040e62d9580..f6a7394a9fc 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -72,18 +72,18 @@ struct ExtractStringImpl // read a ASCII word from pos to word // if the word size exceeds max_word_size, only read max_word_size byte // in FuntionsStringHash, the default value of max_word_size is 128 - static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, const size_t & max_word_size) + static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, size_t max_word_size) { // jump seperators - while (pos < end && !isAlphaNum(*pos)) + while (pos < end && !isAlphaNumericASCII(*pos)) ++pos; // word start from here const char * word_start = pos; - while (pos < end && isAlphaNum(*pos)) + while (pos < end && isAlphaNumericASCII(*pos)) ++pos; - size_t word_size = (static_cast(pos - word_start) <= max_word_size) ? 
pos - word_start : max_word_size; + size_t word_size = std::min(pos - word_start, max_word_size); memcpy(word, word_start, word_size); if (CaseInsensitive) @@ -107,7 +107,7 @@ struct ExtractStringImpl // read one UTF8 word from pos to word // also, we assume that one word size cann't exceed max_word_size with default value 128 - static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, const size_t & max_word_size) + static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, size_t max_word_size) { // jump UTF8 seperator while (pos < end && isUTF8Sep(*pos)) @@ -122,7 +122,7 @@ struct ExtractStringImpl } private: - static ALWAYS_INLINE inline bool isAlphaNum(const UInt8 c) + static ALWAYS_INLINE inline bool isAlphaNumericASCII(const UInt8 c) { return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122); } @@ -134,7 +134,7 @@ private: } // we use ASCII non-alphanum character as UTF8 seperator - static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNum(c); } + static ALWAYS_INLINE inline bool isUTF8Sep(const UInt8 c) { return c < 128 && !isAlphaNumericASCII(c); } // read one UTF8 character and return it static ALWAYS_INLINE inline UInt32 readOneUTF8Code(const char *& pos, const char * end) diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index d7277fcb98b..2195ff7c703 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -31,47 +31,35 @@ struct Hash #endif } - static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, const size_t & size, const size_t & offset) + static ALWAYS_INLINE inline UInt64 wordShinglesHash(const UInt64 * hashes, size_t size, size_t offset) { - UInt64 res = 0; - UInt8 flag = 0; + UInt64 crc = -1ULL; +#ifdef __SSE4_2__ for (size_t i = offset; i < size; ++i) - { - if (flag) - res &= intHashCRC32(hashes[i]); - 
else - res |= intHashCRC32(hashes[i]); - flag = (flag + 1) % 2; - } + crc = _mm_crc32_u64(crc, hashes[i]); for (size_t i = 0; i < offset; ++i) - { - if (flag) - res &= intHashCRC32(hashes[i]); - else - res |= intHashCRC32(hashes[i]); - flag = (flag + 1) % 2; - } - return res; + crc = _mm_crc32_u64(crc, hashes[i]); +#else + for (size_t i = offset; i < size; ++i) + crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); + for (size_t i = 0; i < offset; ++i) + crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); +#endif + return crc; } template - static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, const size_t & K) + static ALWAYS_INLINE inline UInt64 hashSum(const CodePoint * hashes, size_t K) { - UInt64 even = 0; - UInt64 odd = 0; - size_t i = 0; - for (; i + 1 < K; i += 2) - { - even |= intHashCRC32(hashes[i]); - odd |= intHashCRC32(hashes[i + 1]); - } - if (i < K) - even |= intHashCRC32(hashes[K - 1]); + UInt64 crc = -1ULL; #ifdef __SSE4_2__ - return _mm_crc32_u64(even, odd); + for (size_t i = 0; i < K; ++i) + crc = _mm_crc32_u64(crc, hashes[i]); #else - return (intHashCRC32(even) ^ intHashCRC32(odd)); + for (size_t i = 0; i < K; ++i) + crc = intHashCRC32(crc) ^ intHashCRC32(hashes[i]); #endif + return crc; } }; @@ -93,7 +81,7 @@ struct SimhashImpl // finally return a 64 bit value(UInt64), i'th bit is 1 means vector[i] > 0, otherwise, vector[i] < 0 static ALWAYS_INLINE inline UInt64 ngramCalculateHashValue( const char * data, - const size_t size, + size_t size, size_t (*read_code_points)(CodePoint *, const char *&, const char *), UInt64 (*hash_functor)(const CodePoint *)) { @@ -146,9 +134,9 @@ struct SimhashImpl // values to caculate the next word shingle hash value static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue( const char * data, - const size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &), - UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &)) + size_t size, + size_t 
(*read_one_word)(CodePoint *, const char *&, const char *, size_t), + UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; const char * end = data + size; @@ -156,7 +144,7 @@ struct SimhashImpl // Also, a 64 bit vector initialized to zero Int64 finger_vec[64] = {}; // a array to store N word hash values - UInt64 nwordHashes[N] = {}; + UInt64 nword_hashes[N] = {}; // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; @@ -167,16 +155,16 @@ struct SimhashImpl if (word_size) { // for each word, calculate a hash value and stored into the array - nwordHashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf, word_size); } } // calculate the first word shingle hash value - UInt64 hash_value = hash_functor(nwordHashes, N, 0); - std::bitset<64> bits_(hash_value); + UInt64 hash_value = hash_functor(nword_hashes, N, 0); + std::bitset<64> first_bits(hash_value); for (size_t i = 0; i < 64; ++i) { - finger_vec[i] += ((bits_.test(i)) ? 1 : -1); + finger_vec[i] += ((first_bits.test(i)) ? 
1 : -1); } size_t offset = 0; @@ -187,12 +175,12 @@ struct SimhashImpl // so we need to store new word hash into location of a0, then ,this array become // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| - nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf, word_size); offset = (offset + 1) % N; // according to the word hash storation way, in order to not lose the word shingle's // sequence information, when calculation word shingle hash value, we need provide the offset // inforation, which is the offset of the first word's hash value of the word shingle - hash_value = hash_functor(nwordHashes, N, offset); + hash_value = hash_functor(nword_hashes, N, offset); std::bitset<64> bits(hash_value); for (size_t i = 0; i < 64; ++i) { @@ -272,7 +260,7 @@ struct MinhashImpl // insert a new value into K minimum hash array if this value // is smaller than the greatest value in the array - static ALWAYS_INLINE inline void insert_minValue(UInt64 * hashes, UInt64 v) + static ALWAYS_INLINE inline void insertMinValue(UInt64 * hashes, UInt64 v) { size_t i = 0; for (; i < K && hashes[i] <= v; ++i) @@ -286,7 +274,7 @@ struct MinhashImpl // insert a new value into K maximum hash array if this value // is greater than the smallest value in the array - static ALWAYS_INLINE inline void insert_maxValue(UInt64 * hashes, UInt64 v) + static ALWAYS_INLINE inline void insertMaxValue(UInt64 * hashes, UInt64 v) { int i = K - 1; for (; i >= 0 && hashes[i] >= v; --i) @@ -305,7 +293,7 @@ struct MinhashImpl // return this two hashsum: Tuple(hashsum1, hashsum2) static ALWAYS_INLINE inline std::tuple ngramCalculateHashValue( const char * data, - const size_t size, + size_t size, size_t (*read_code_points)(CodePoint *, const char *&, const char *), UInt64 (*hash_functor)(const CodePoint *)) { @@ -326,8 +314,8 @@ struct MinhashImpl auto 
new_hash = hash_functor(cp + iter); // insert the new hash value into array used to store K minimum value // and K maximum value - insert_minValue(k_minimum, new_hash); - insert_maxValue(k_maxinum, new_hash); + insertMinValue(k_minimum, new_hash); + insertMaxValue(k_maxinum, new_hash); } iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); @@ -343,9 +331,9 @@ struct MinhashImpl // K minimum and K maximum hash value static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( const char * data, - const size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, const size_t &), - UInt64 (*hash_functor)(const UInt64 *, const size_t &, const size_t &)) + size_t size, + size_t (*read_one_word)(CodePoint *, const char *&, const char *, size_t), + UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; const char * end = start + size; @@ -353,7 +341,7 @@ struct MinhashImpl UInt64 k_minimum[K] = {}; UInt64 k_maxinum[K] = {}; // array to store n word hashes - UInt64 nwordHashes[N] = {}; + UInt64 nword_hashes[N] = {}; // word buffer to store one word CodePoint word_buf[max_word_size] = {}; size_t word_size; @@ -364,22 +352,22 @@ struct MinhashImpl word_size = read_one_word(word_buf, start, end, max_word_size); if (word_size) { - nwordHashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf, word_size); } } - auto new_hash = hash_functor(nwordHashes, N, 0); - insert_minValue(k_minimum, new_hash); - insert_maxValue(k_maxinum, new_hash); + auto new_hash = hash_functor(nword_hashes, N, 0); + insertMinValue(k_minimum, new_hash); + insertMaxValue(k_maxinum, new_hash); size_t offset = 0; while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) { - nwordHashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf, word_size); offset = (offset + 1) % N; - new_hash = hash_functor(nwordHashes, 
N, offset); - insert_minValue(k_minimum, new_hash); - insert_maxValue(k_maxinum, new_hash); + new_hash = hash_functor(nword_hashes, N, offset); + insertMinValue(k_minimum, new_hash); + insertMaxValue(k_maxinum, new_hash); } // calculate hashsum diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index bb1e42ab5fa..bada7490288 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include namespace DB { @@ -21,7 +21,7 @@ namespace ErrorCodes // FunctionStringHash // Simhash: String -> UInt64 // Minhash: String -> (UInt64, UInt64) -template +template class FunctionsStringHash : public IFunction { public: @@ -38,7 +38,7 @@ public: if (!isString(arguments[0])) throw Exception( "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - if (IsSimhash) + if constexpr (is_simhash) return std::make_shared>(); auto element = DataTypeFactory::instance().get("UInt64"); return std::make_shared(DataTypes{element, element}); @@ -49,7 +49,7 @@ public: const ColumnPtr & column = block.getByPosition(arguments[0]).column; const ColumnConst * col_const = typeid_cast(&*column); using ResultType = typename Impl::ResultType; - if constexpr (IsSimhash) + if constexpr (is_simhash) { if (col_const) { diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index fdef72d4c43..5c13a57c426 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -40,21 +40,11 @@ struct BitHammingDistanceImpl c[i] = apply(a, b[i]); } - static ResultType constant_constant(A a, B b) { return apply(a, b); } - private: - static UInt8 pop_cnt(UInt64 res) - { - UInt8 count = 0; - for (; res; res >>= 1) - count += res & 1u; - return count; - } - static inline UInt8 apply(UInt64 a, UInt64 b) { UInt64 res = a ^ b; - return pop_cnt(res); + return 
__builtin_popcountll(res); } }; @@ -75,9 +65,7 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { - return castType(right, [&](const auto & right_) { return f(left_, right_); }); - }); + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); } // bitHammingDistance function: (Integer, Integer) -> UInt8 @@ -103,12 +91,13 @@ public: return std::make_shared(); } + bool useDefaultImplementationForConstants() const override { return true; } + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { auto * left_generic = block.getByPosition(arguments[0]).type.get(); auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) - { + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -121,16 +110,6 @@ public: auto col_left_raw = block.getByPosition(arguments[0]).column.get(); auto col_right_raw = block.getByPosition(arguments[1]).column.get(); - if (auto col_left = checkAndGetColumnConst(col_left_raw)) - { - if (auto col_right = checkAndGetColumnConst(col_right_raw)) - { - // constant integer - constant integer - auto res = OpImpl::constant_constant(col_left->template getValue(), col_right->template getValue()); - block.getByPosition(result).column = DataTypeUInt8().createColumnConst(col_left->size(), toField(res)); - return true; - } - } typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); diff --git a/src/Functions/registerFunctions.cpp b/src/Functions/registerFunctions.cpp index 02013e33d16..f3e2883a179 
100644 --- a/src/Functions/registerFunctions.cpp +++ b/src/Functions/registerFunctions.cpp @@ -3,7 +3,6 @@ namespace DB { - void registerFunctionsArithmetic(FunctionFactory &); void registerFunctionsArray(FunctionFactory &); void registerFunctionsTuple(FunctionFactory &); @@ -37,6 +36,9 @@ void registerFunctionsIntrospection(FunctionFactory &); void registerFunctionsNull(FunctionFactory &); void registerFunctionsJSON(FunctionFactory &); void registerFunctionsConsistentHashing(FunctionFactory & factory); +void registerFunctionBitHammingDistance(FunctionFactory & factory); +void registerFunctionTupleHammingDistance(FunctionFactory & factory); +void registerFunctionsStringHash(FunctionFactory & factory); void registerFunctions() @@ -78,6 +80,9 @@ void registerFunctions() registerFunctionsJSON(factory); registerFunctionsIntrospection(factory); registerFunctionsConsistentHashing(factory); + registerFunctionBitHammingDistance(factory); + registerFunctionTupleHammingDistance(factory); + registerFunctionsStringHash(factory); } } diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 45c113edad4..8b3f9a696aa 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -51,19 +51,7 @@ struct TupleHammingDistanceImpl static ResultType constant_constant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } private: - static UInt8 pop_cnt(UInt64 res) - { - UInt8 count = 0; - for (; res; res >>= 1) - count += res & 1u; - return count; - } - - static inline UInt8 apply(UInt64 a, UInt64 b) - { - UInt64 res = a ^ b; - return pop_cnt(res); - } + static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; } }; template @@ -83,12 +71,10 @@ bool castType(const IDataType * type, F && f) template static bool castBothTypes(const IDataType * left, const IDataType * right, F && f) { - return castType(left, [&](const auto & left_) { - return castType(right, [&](const auto & 
right_) { return f(left_, right_); }); - }); + return castType(left, [&](const auto & left_) { return castType(right, [&](const auto & right_) { return f(left_, right_); }); }); } -// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->UInt8 +// tupleHammingDistance function: (Tuple(Integer, Integer), Tuple(Integer, Integer))->0/1/2 // in order to avoid code bloating, for non-constant tuple, we make sure that the elements // in the tuple should have same data type, and for constant tuple, elements can be any integer // data type, we cast all of them into UInt64 @@ -126,8 +112,7 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); - bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) - { + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -138,7 +123,9 @@ public: using OpImpl = TupleHammingDistanceImpl; - // constant tuple - constant tuple + // we can not useDefaultImplementationForConstants, + // because with that, tupleHammingDistance((10, 300), (10, 20)) does not work, + // since 10 has data type UInt8, and 300 has data type UInt16 if (const ColumnConst * const_col_left = checkAndGetColumnConst(arg1.column.get())) { if (const ColumnConst * const_col_right = checkAndGetColumnConst(arg2.column.get())) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.reference b/tests/queries/0_stateless/01016_simhash_minhash.reference new file mode 100644 index 00000000000..7fa70b343a4 --- /dev/null +++ b/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -0,0 +1,59 @@ +0 +2718169299 +2718169299 +3333471646 +26585365 +4124079607 +4124079607 +4124079607 +979945684 
+(3614688582,3614688582) +(3614688582,3614688582) +(765622645,765622645) +(765622645,765622645) +(765622645,765622645) +(765622645,765622645) +(3573094983,3573094983) +(3573094983,3573094983) +(3604768422,3604768422) +(3604768422,3604768422) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,1599892600) +(3614688582,996508363) +(3614688582,996508363) +2548869326 +2548869326 +401385678 +401385710 +4258739090 +4260836242 +718415633 +718681881 +2314703251 +1238864275 +3900085650 +3907425682 +2314703251 +1238864275 +3569207545 +3568143609 +(1436198067,1436198067) +(1436198067,1436198067) +(3846780865,3846780865) +(1956854492,1956854492) +(2929435161,2929435161) +(2929435161,2929435161) +(3310088565,3310088565) +(3310088565,3310088565) +(3614688582,1294895121) +(3614688582,1294895121) +(3614688582,1138551650) +(3614688582,1138551650) +(3614688582,1294895121) +(3614688582,1294895121) +(3614688582,2840007763) +(3614688582,929186815) diff --git a/tests/queries/0_stateless/01016_simhash_minhash.sql b/tests/queries/0_stateless/01016_simhash_minhash.sql new file mode 100644 index 00000000000..9e87216d26f --- /dev/null +++ b/tests/queries/0_stateless/01016_simhash_minhash.sql @@ -0,0 +1,47 @@ +SELECT ngramSimhash(''); +SELECT ngramSimhash('what a cute cat.'); +SELECT ngramSimhashCaseInsensitive('what a cute cat.'); +SELECT ngramSimhashUTF8('what a cute cat.'); +SELECT ngramSimhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT wordShingleSimhash('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleSimhashUTF8('what a cute cat.'); +SELECT wordShingleSimhashCaseInsensitiveUTF8('what a cute cat.'); + +SELECT ngramMinhash(''); +SELECT ngramMinhash('what a cute cat.'); +SELECT ngramMinhashCaseInsensitive('what a cute cat.'); +SELECT ngramMinhashUTF8('what a cute cat.'); +SELECT ngramMinhashCaseInsensitiveUTF8('what a cute cat.'); +SELECT 
wordShingleMinhash('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitive('what a cute cat.'); +SELECT wordShingleMinhashUTF8('what a cute cat.'); +SELECT wordShingleMinhashCaseInsensitiveUTF8('what a cute cat.'); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + s String +)ENGINE = Memory(); + +INSERT INTO defaults values ('It is the latest occurrence of the Southeast European haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.') ('It is the latest occurrence of the Southeast Asian haze, the issue that occurs in constant intensity during every wet season. It has mainly been caused by forest fires resulting from illegal slash-and-burn clearing performed on behalf of the palm oil industry in Kazakhstan, principally on the islands, which then spread quickly in the dry season.'); + +SELECT ngramSimhash(s) FROM defaults; +SELECT ngramSimhashCaseInsensitive(s) FROM defaults; +SELECT ngramSimhashUTF8(s) FROM defaults; +SELECT ngramSimhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleSimhash(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleSimhashUTF8(s) FROM defaults; +SELECT wordShingleSimhashCaseInsensitiveUTF8(s) FROM defaults; + +SELECT ngramMinhash(s) FROM defaults; +SELECT ngramMinhashCaseInsensitive(s) FROM defaults; +SELECT ngramMinhashUTF8(s) FROM defaults; +SELECT ngramMinhashCaseInsensitiveUTF8(s) FROM defaults; +SELECT wordShingleMinhash(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitive(s) FROM defaults; +SELECT wordShingleMinhashUTF8(s) FROM defaults; +SELECT wordShingleMinhashCaseInsensitiveUTF8(s) FROM defaults; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01017_bithamming_distance.reference 
b/tests/queries/0_stateless/01017_bithamming_distance.reference new file mode 100644 index 00000000000..cc2d4f39154 --- /dev/null +++ b/tests/queries/0_stateless/01017_bithamming_distance.reference @@ -0,0 +1,15 @@ +1 +7 +63 +2 +1 +3 +5 +4 +6 +6 +6 +3 +5 +9 +9 diff --git a/tests/queries/0_stateless/01017_bithamming_distance.sql b/tests/queries/0_stateless/01017_bithamming_distance.sql new file mode 100644 index 00000000000..4b36894b97c --- /dev/null +++ b/tests/queries/0_stateless/01017_bithamming_distance.sql @@ -0,0 +1,20 @@ +SELECT bitHammingDistance(1, 5); +SELECT bitHammingDistance(100, 100000); +SELECT bitHammingDistance(-1, 1); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + n1 UInt8, + n2 UInt16, + n3 UInt32, + n4 UInt64 +)ENGINE = Memory(); + +INSERT INTO defaults VALUES (1, 2, 3, 4) (12, 4345, 435, 1233) (45, 675, 32343, 54566) (90, 784, 9034, 778752); + +SELECT bitHammingDistance(4, n1) FROM defaults; +SELECT bitHammingDistance(n2, 100) FROM defaults; +SELECT bitHammingDistance(n3, n4) FROM defaults; + +DROP TABLE defaults; diff --git a/tests/queries/0_stateless/01017_tuplehamming_distance.reference b/tests/queries/0_stateless/01017_tuplehamming_distance.reference new file mode 100644 index 00000000000..017ffb0cd33 --- /dev/null +++ b/tests/queries/0_stateless/01017_tuplehamming_distance.reference @@ -0,0 +1,15 @@ +2 +1 +1 +0 +2 +2 +2 +2 +1 +2 +2 +2 +0 +2 +2 diff --git a/tests/queries/0_stateless/01017_tuplehamming_distance.sql b/tests/queries/0_stateless/01017_tuplehamming_distance.sql new file mode 100644 index 00000000000..d0ed1cee096 --- /dev/null +++ b/tests/queries/0_stateless/01017_tuplehamming_distance.sql @@ -0,0 +1,19 @@ +SELECT tupleHammingDistance((1, 2), (3, 4)); +SELECT tupleHammingDistance((120, 243), (120, 434)); +SELECT tupleHammingDistance((-12, 434), (434, 434)); + +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + t1 Tuple(UInt16, UInt16), + t2 Tuple(UInt32, UInt32), + t3 Tuple(Int64, Int64) +)ENGINE = Memory(); 
+ +INSERT INTO defaults VALUES ((12, 43), (12312, 43453) ,(-10, 32)) ((1, 4), (546, 12345), (546, 12345)) ((90, 9875), (43456, 234203), (1231, -123)) ((87, 987), (545645, 768354634), (9123, 909)); + +SELECT tupleHammingDistance((12, 43), t1) FROM defaults; +SELECT tupleHammingDistance(t2, (546, 456)) FROM defaults; +SELECT tupleHammingDistance(t2, t3) FROM defaults; + +DROP TABLE defaults; From 7b4fc7300c85b38a272a276ff860e226f33a578a Mon Sep 17 00:00:00 2001 From: feng lv Date: Wed, 10 Jun 2020 23:02:58 +0800 Subject: [PATCH 007/504] update fix fix fix --- src/Functions/ExtractString.h | 57 ++---- src/Functions/FunctionsStringHash.cpp | 167 +++++++++--------- src/Functions/FunctionsStringHash.h | 84 +++------ .../01016_simhash_minhash.reference | 59 +++---- 4 files changed, 145 insertions(+), 222 deletions(-) diff --git a/src/Functions/ExtractString.h b/src/Functions/ExtractString.h index f6a7394a9fc..51d6f17380c 100644 --- a/src/Functions/ExtractString.h +++ b/src/Functions/ExtractString.h @@ -1,3 +1,5 @@ +#include +#include #include #include @@ -19,6 +21,9 @@ struct ExtractStringImpl { static constexpr size_t default_padding = 16; + // the length of code_points = default_padding + N -1 + // pos: the current beginning location that we want to copy data + // end: the end loction of the string static ALWAYS_INLINE size_t readASCIICodePoints(UInt8 * code_points, const char *& pos, const char * end) { /// Offset before which we copy some data. 
@@ -47,32 +52,8 @@ struct ExtractStringImpl return default_padding; } - // used by FunctionsStringHash - // it's not easy to add padding for ColumnString, so we need safety check each memcpy - static ALWAYS_INLINE size_t readASCIICodePointsNoPadding(UInt8 * code_points, const char *& pos, const char * end) - { - constexpr size_t padding_offset = default_padding - N + 1; - memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(UInt8)); - - // safety check - size_t cpy_size = (pos + padding_offset > end) ? end - pos : padding_offset; - - memcpy(code_points + (N - 1), pos, cpy_size * sizeof(UInt8)); - - if constexpr (CaseInsensitive) - { - unrollLowering(code_points, std::make_index_sequence()); - } - pos += padding_offset; - if (pos > end) - return default_padding - (pos - end); - return default_padding; - } - - // read a ASCII word from pos to word - // if the word size exceeds max_word_size, only read max_word_size byte - // in FuntionsStringHash, the default value of max_word_size is 128 - static ALWAYS_INLINE inline size_t readOneASCIIWord(UInt8 * word, const char *& pos, const char * end, size_t max_word_size) + // read a ASCII word + static ALWAYS_INLINE inline size_t readOneASCIIWord(PaddedPODArray & word_buf, const char *& pos, const char * end) { // jump seperators while (pos < end && !isAlphaNumericASCII(*pos)) @@ -83,14 +64,12 @@ struct ExtractStringImpl while (pos < end && isAlphaNumericASCII(*pos)) ++pos; - size_t word_size = std::min(pos - word_start, max_word_size); - - memcpy(word, word_start, word_size); + word_buf.assign(word_start, pos); if (CaseInsensitive) { - std::transform(word, word + word_size, word, [](UInt8 c) { return std::tolower(c); }); + std::transform(word_buf.begin(), word_buf.end(), word_buf.begin(), [](UInt8 c) { return std::tolower(c); }); } - return word_size; + return word_buf.size(); } static ALWAYS_INLINE inline size_t readUTF8CodePoints(UInt32 * code_points, const char *& pos, const char * 
end) @@ -106,27 +85,21 @@ struct ExtractStringImpl } // read one UTF8 word from pos to word - // also, we assume that one word size cann't exceed max_word_size with default value 128 - static ALWAYS_INLINE inline size_t readOneUTF8Word(UInt32 * word, const char *& pos, const char * end, size_t max_word_size) + static ALWAYS_INLINE inline size_t readOneUTF8Word(PaddedPODArray & word_buf, const char *& pos, const char * end) { // jump UTF8 seperator while (pos < end && isUTF8Sep(*pos)) ++pos; + word_buf.clear(); // UTF8 word's character number - size_t num = 0; - while (pos < end && num < max_word_size && !isUTF8Sep(*pos)) + while (pos < end && !isUTF8Sep(*pos)) { - word[num++] = readOneUTF8Code(pos, end); + word_buf.push_back(readOneUTF8Code(pos, end)); } - return num; + return word_buf.size(); } private: - static ALWAYS_INLINE inline bool isAlphaNumericASCII(const UInt8 c) - { - return (c >= 48 && c <= 57) || (c >= 65 && c <= 90) || (c >= 97 && c <= 122); - } - template static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) { diff --git a/src/Functions/FunctionsStringHash.cpp b/src/Functions/FunctionsStringHash.cpp index 2195ff7c703..f8c78a808b3 100644 --- a/src/Functions/FunctionsStringHash.cpp +++ b/src/Functions/FunctionsStringHash.cpp @@ -5,11 +5,15 @@ #include #include #include +#include #include #include +#include +#include #include +#include #include namespace DB @@ -64,6 +68,11 @@ struct Hash }; // Simhash String -> UInt64 +// N: the length of ngram or words shingles +// CodePoint: UInt8(ASCII) or UInt32(UTF8) +// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) +// Ngram: means ngram(true) or words shingles(false) +// CaseInsensitive: means should we consider about letter case or not template struct SimhashImpl { @@ -71,7 +80,6 @@ struct SimhashImpl using StrOp = ExtractStringImpl; // we made an assumption that the size of one word cann't exceed 128, 
which may not true // if some word's size exceed 128, it would be cut up to several word - static constexpr size_t max_word_size = 1u << 7; static constexpr size_t max_string_size = 1u << 15; static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; @@ -135,7 +143,7 @@ struct SimhashImpl static ALWAYS_INLINE inline UInt64 wordShinglesCalculateHashValue( const char * data, size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, size_t), + size_t (*read_one_word)(PaddedPODArray &, const char *&, const char *), UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; @@ -146,16 +154,15 @@ struct SimhashImpl // a array to store N word hash values UInt64 nword_hashes[N] = {}; // word buffer to store one word - CodePoint word_buf[max_word_size] = {}; - size_t word_size; + PaddedPODArray word_buf; // get first word shingle for (size_t i = 0; i < N && start < end; ++i) { - word_size = read_one_word(word_buf, start, end, max_word_size); - if (word_size) + read_one_word(word_buf, start, end); + if (!word_buf.empty()) { // for each word, calculate a hash value and stored into the array - nword_hashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size()); } } @@ -168,14 +175,14 @@ struct SimhashImpl } size_t offset = 0; - while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) + while (start < end && read_one_word(word_buf, start, end)) { // we need to store the new word hash value to the oldest location. 
// for example, N = 5, array |a0|a1|a2|a3|a4|, now , a0 is the oldest location, // so we need to store new word hash into location of a0, then ,this array become // |a5|a1|a2|a3|a4|, next time, a1 become the oldest location, we need to store new // word hash value into locaion of a1, then array become |a5|a6|a2|a3|a4| - nword_hashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size()); offset = (offset + 1) % N; // according to the word hash storation way, in order to not lose the word shingle's // sequence information, when calculation word shingle hash value, we need provide the offset @@ -203,7 +210,7 @@ struct SimhashImpl if constexpr (Ngram) { if constexpr (!UTF8) - return calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + return calc_func(std::forward(args)..., StrOp::readASCIICodePoints, Hash::ngramASCIIHash); else return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); } @@ -216,17 +223,7 @@ struct SimhashImpl } } - // constant string - static inline void constant(const String data, UInt64 & res) - { - if constexpr (Ngram) - res = dispatch(ngramCalculateHashValue, data.data(), data.size()); - else - res = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); - } - - // non-constant string - static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) + static void apply(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) { for (size_t i = 0; i < offsets.size(); ++i) { @@ -239,53 +236,64 @@ struct SimhashImpl else res[i] = dispatch(wordShinglesCalculateHashValue, one_data, data_size); } + else + res[i] = -1ull; } } }; +template +class FixedHeap +{ +public: + FixedHeap() = delete; + + explicit FixedHeap(F f_) : f(f_), data_t(std::make_shared>(K, v)) + { + std::make_heap(data_t->begin(), data_t->end(), f); + } + 
+ void insertAndReplace(size_t new_v) + { + data_t->push_back(new_v); + std::push_heap(data_t->begin(), data_t->end(), f); + std::pop_heap(data_t->begin(), data_t->end(), f); + data_t->pop_back(); + } + + const size_t * data() { return data_t->data(); } + +private: + F f; + std::shared_ptr> data_t; +}; + + // Minhash: String -> Tuple(UInt64, UInt64) // for each string, we extract ngram or word shingle, // for each ngram or word shingle, calculate a hash value, // then we take the K minimum hash values to calculate a hashsum, // and take the K maximum hash values to calculate another hashsum, // return this two hashsum: Tuple(hashsum1, hashsum2) +// +// N: the length of ngram or words shingles +// K: the number of minimum hashes and maximum hashes that we keep +// CodePoint: UInt8(ASCII) or UInt32(UTF8) +// UTF8: means ASCII or UTF8, these two parameters CodePoint and UTF8 can only be (UInt8, false) or (UInt32, true) +// Ngram: means ngram(true) or words shingles(false) +// CaseInsensitive: means should we consider about letter case or not template struct MinhashImpl { + using Less = std::less; + using Greater = std::greater; + using MaxHeap = FixedHeap, K, -1ULL>; + using MinHeap = FixedHeap, K, 0>; using ResultType = UInt64; using StrOp = ExtractStringImpl; - static constexpr size_t max_word_size = 1u << 7; static constexpr size_t max_string_size = 1u << 15; static constexpr size_t simultaneously_codepoints_num = StrOp::default_padding + N - 1; - // insert a new value into K minimum hash array if this value - // is smaller than the greatest value in the array - static ALWAYS_INLINE inline void insertMinValue(UInt64 * hashes, UInt64 v) - { - size_t i = 0; - for (; i < K && hashes[i] <= v; ++i) - ; - if (i == K) - return; - for (size_t j = K - 2; j >= i; --j) - hashes[j + 1] = hashes[j]; - hashes[i] = v; - } - - // insert a new value into K maximum hash array if this value - // is greater than the smallest value in the array - static ALWAYS_INLINE inline void 
insertMaxValue(UInt64 * hashes, UInt64 v) - { - int i = K - 1; - for (; i >= 0 && hashes[i] >= v; --i) - ; - if (i < 0) - return; - for (int j = 1; j <= i; ++j) - hashes[j - 1] = hashes[j]; - hashes[i] = v; - } - // Minhash ngram calculate function, String -> Tuple(UInt64, UInt64) // we extract ngram from input string, and calculate a hash value for each ngram // then we take the K minimum hash values to calculate a hashsum, @@ -300,8 +308,8 @@ struct MinhashImpl const char * start = data; const char * end = data + size; // we just maintain the K minimu and K maximum hash values - UInt64 k_minimum[K] = {}; - UInt64 k_maxinum[K] = {}; + MaxHeap k_minimum_hashes(Less{}); + MinHeap k_maximum_hashes(Greater{}); CodePoint cp[simultaneously_codepoints_num] = {}; size_t found = read_code_points(cp, start, end); @@ -314,15 +322,15 @@ struct MinhashImpl auto new_hash = hash_functor(cp + iter); // insert the new hash value into array used to store K minimum value // and K maximum value - insertMinValue(k_minimum, new_hash); - insertMaxValue(k_maxinum, new_hash); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); } iter = 0; } while (start < end && (found = read_code_points(cp, start, end))); // calculate hashsum of the K minimum hash values and K maximum hash values - UInt64 res1 = Hash::hashSum(k_maxinum, K); - UInt64 res2 = Hash::hashSum(k_maxinum, K); + UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K); + UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K); return std::make_tuple(res1, res2); } @@ -332,47 +340,46 @@ struct MinhashImpl static ALWAYS_INLINE inline std::tuple wordShinglesCalculateHashValue( const char * data, size_t size, - size_t (*read_one_word)(CodePoint *, const char *&, const char *, size_t), + size_t (*read_one_word)(PaddedPODArray &, const char *&, const char *), UInt64 (*hash_functor)(const UInt64 *, size_t, size_t)) { const char * start = data; const char * end = start + size; // also we just 
store the K minimu and K maximum hash values - UInt64 k_minimum[K] = {}; - UInt64 k_maxinum[K] = {}; + MaxHeap k_minimum_hashes(Less{}); + MinHeap k_maximum_hashes(Greater{}); // array to store n word hashes UInt64 nword_hashes[N] = {}; // word buffer to store one word - CodePoint word_buf[max_word_size] = {}; - size_t word_size; + PaddedPODArray word_buf; // how word shingle hash value calculation and word hash storation is same as we // have descripted in Simhash wordShinglesCalculateHashValue function for (size_t i = 0; i < N && start < end; ++i) { - word_size = read_one_word(word_buf, start, end, max_word_size); - if (word_size) + read_one_word(word_buf, start, end); + if (!word_buf.empty()) { - nword_hashes[i++] = Hash::hashSum(word_buf, word_size); + nword_hashes[i++] = Hash::hashSum(word_buf.data(), word_buf.size()); } } auto new_hash = hash_functor(nword_hashes, N, 0); - insertMinValue(k_minimum, new_hash); - insertMaxValue(k_maxinum, new_hash); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); size_t offset = 0; - while (start < end && (word_size = read_one_word(word_buf, start, end, max_word_size))) + while (start < end && read_one_word(word_buf, start, end)) { - nword_hashes[offset] = Hash::hashSum(word_buf, word_size); + nword_hashes[offset] = Hash::hashSum(word_buf.data(), word_buf.size()); offset = (offset + 1) % N; new_hash = hash_functor(nword_hashes, N, offset); - insertMinValue(k_minimum, new_hash); - insertMaxValue(k_maxinum, new_hash); + k_minimum_hashes.insertAndReplace(new_hash); + k_maximum_hashes.insertAndReplace(new_hash); } // calculate hashsum - UInt64 res1 = Hash::hashSum(k_minimum, K); - UInt64 res2 = Hash::hashSum(k_maxinum, K); + UInt64 res1 = Hash::hashSum(k_minimum_hashes.data(), K); + UInt64 res2 = Hash::hashSum(k_maximum_hashes.data(), K); return std::make_tuple(res1, res2); } @@ -382,7 +389,7 @@ struct MinhashImpl if constexpr (Ngram) { if constexpr (!UTF8) - return 
calc_func(std::forward(args)..., StrOp::readASCIICodePointsNoPadding, Hash::ngramASCIIHash); + return calc_func(std::forward(args)..., StrOp::readASCIICodePoints, Hash::ngramASCIIHash); else return calc_func(std::forward(args)..., StrOp::readUTF8CodePoints, Hash::ngramUTF8Hash); } @@ -395,17 +402,7 @@ struct MinhashImpl } } - // constant string - static void constant(const String data, UInt64 & res1, UInt64 & res2) - { - if constexpr (Ngram) - std::tie(res1, res2) = dispatch(ngramCalculateHashValue, data.data(), data.size()); - else - std::tie(res1, res2) = dispatch(wordShinglesCalculateHashValue, data.data(), data.size()); - } - - // non-constant string - static void vector( + static void apply( const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res1, @@ -422,6 +419,8 @@ struct MinhashImpl else std::tie(res1[i], res2[i]) = dispatch(wordShinglesCalculateHashValue, one_data, data_size); } + else + std::tie(res1[i], res2[i]) = std::make_tuple(-1ull, -1ull); } } }; diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index bada7490288..23c6db51e8e 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -44,77 +44,37 @@ public: return std::make_shared(DataTypes{element, element}); } + bool useDefaultImplementationForConstants() const override { return true; } + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { const ColumnPtr & column = block.getByPosition(arguments[0]).column; - const ColumnConst * col_const = typeid_cast(&*column); using ResultType = typename Impl::ResultType; if constexpr (is_simhash) { - if (col_const) - { - ResultType res{}; - const String & str_data = col_const->getValue(); - if (str_data.size() > Impl::max_string_size) - { - throw Exception( - "String size is too big for function " + getName() + ". 
Should be at most " + std::to_string(Impl::max_string_size), - ErrorCodes::TOO_LARGE_STRING_SIZE); - } - Impl::constant(str_data, res); - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(1, toField(res)); - } - else - { - // non const string - auto col_res = ColumnVector::create(); - typename ColumnVector::Container & vec_res = col_res->getData(); - vec_res.resize(column->size()); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); - block.getByPosition(result).column = std::move(col_res); - } + // non const string, const case is handled by useDefaultImplementationForConstants. + auto col_res = ColumnVector::create(); + typename ColumnVector::Container & vec_res = col_res->getData(); + vec_res.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_res); + block.getByPosition(result).column = std::move(col_res); } else // Min hash { - if (col_const) - { - ResultType h1, h2; - const String & str_data = col_const->getValue(); - if (str_data.size() > Impl::max_string_size) - { - throw Exception( - "String size is too big for function " + getName() + ". 
Should be at most " + std::to_string(Impl::max_string_size), - ErrorCodes::TOO_LARGE_STRING_SIZE); - } - Impl::constant(str_data, h1, h2); - auto h1_col = ColumnVector::create(1); - auto h2_col = ColumnVector::create(1); - typename ColumnVector::Container & h1_data = h1_col->getData(); - typename ColumnVector::Container & h2_data = h2_col->getData(); - h1_data[0] = h1; - h2_data[0] = h2; - MutableColumns tuple_columns; - tuple_columns.emplace_back(std::move(h1_col)); - tuple_columns.emplace_back(std::move(h2_col)); - block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); - } - else - { - // non const string - auto col_h1 = ColumnVector::create(); - auto col_h2 = ColumnVector::create(); - typename ColumnVector::Container & vec_h1 = col_h1->getData(); - typename ColumnVector::Container & vec_h2 = col_h2->getData(); - vec_h1.resize(column->size()); - vec_h2.resize(column->size()); - const ColumnString * col_str_vector = checkAndGetColumn(&*column); - Impl::vector(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); - MutableColumns tuple_columns; - tuple_columns.emplace_back(std::move(col_h1)); - tuple_columns.emplace_back(std::move(col_h2)); - block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); - } + // non const string + auto col_h1 = ColumnVector::create(); + auto col_h2 = ColumnVector::create(); + typename ColumnVector::Container & vec_h1 = col_h1->getData(); + typename ColumnVector::Container & vec_h2 = col_h2->getData(); + vec_h1.resize(column->size()); + vec_h2.resize(column->size()); + const ColumnString * col_str_vector = checkAndGetColumn(&*column); + Impl::apply(col_str_vector->getChars(), col_str_vector->getOffsets(), vec_h1, vec_h2); + MutableColumns tuple_columns; + tuple_columns.emplace_back(std::move(col_h1)); + tuple_columns.emplace_back(std::move(col_h2)); + block.getByPosition(result).column = ColumnTuple::create(std::move(tuple_columns)); } } }; diff --git 
a/tests/queries/0_stateless/01016_simhash_minhash.reference b/tests/queries/0_stateless/01016_simhash_minhash.reference index 7fa70b343a4..2ababa29d1e 100644 --- a/tests/queries/0_stateless/01016_simhash_minhash.reference +++ b/tests/queries/0_stateless/01016_simhash_minhash.reference @@ -7,24 +7,15 @@ 4124079607 4124079607 979945684 -(3614688582,3614688582) -(3614688582,3614688582) -(765622645,765622645) -(765622645,765622645) -(765622645,765622645) -(765622645,765622645) -(3573094983,3573094983) -(3573094983,3573094983) -(3604768422,3604768422) -(3604768422,3604768422) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,1599892600) -(3614688582,996508363) -(3614688582,996508363) +(3700739653,3614688582) +(2594676265,556335836) +(2594676265,556335836) +(3157724679,410999184) +(1378962320,1336242123) +(3277652371,1284714580) +(3277652371,1284714580) +(3277652371,1284714580) +(3140472415,3787127930) 2548869326 2548869326 401385678 @@ -41,19 +32,19 @@ 1238864275 3569207545 3568143609 -(1436198067,1436198067) -(1436198067,1436198067) -(3846780865,3846780865) -(1956854492,1956854492) -(2929435161,2929435161) -(2929435161,2929435161) -(3310088565,3310088565) -(3310088565,3310088565) -(3614688582,1294895121) -(3614688582,1294895121) -(3614688582,1138551650) -(3614688582,1138551650) -(3614688582,1294895121) -(3614688582,1294895121) -(3614688582,2840007763) -(3614688582,929186815) +(1525603924,509999509) +(1525603924,3764233597) +(1525603924,2706466536) +(1525603924,1315689278) +(3824755630,2122451089) +(946380879,2122451089) +(3295904092,4129673330) +(3295904092,4129673330) +(138351420,974287950) +(824220170,974287950) +(3300081739,2402902535) +(3300081739,3993394872) +(138351420,974287950) +(824220170,974287950) +(3083836461,957058619) +(4120380459,90533100) From 61817b30fc5474599b48b16456f0d2f55f756b59 Mon Sep 17 00:00:00 2001 From: feng lv Date: Wed, 24 Jun 2020 00:28:17 +0800 
Subject: [PATCH 008/504] fix --- src/Functions/FunctionsStringSimilarity.cpp | 49 +++++++++++++-------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/src/Functions/FunctionsStringSimilarity.cpp b/src/Functions/FunctionsStringSimilarity.cpp index 81adb1de26f..cf9d4d6e42a 100644 --- a/src/Functions/FunctionsStringSimilarity.cpp +++ b/src/Functions/FunctionsStringSimilarity.cpp @@ -1,6 +1,6 @@ +#include #include #include -#include #include #include #include @@ -268,8 +268,7 @@ struct NgramDistanceImpl size_t distance = second_size; if (data_size <= max_string_size) { - size_t first_size - = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); + size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance, nullptr); /// For !symmetric version we should not use first_size. if constexpr (symmetric) res = distance * 1.f / std::max(first_size + second_size, size_t(1)); @@ -313,14 +312,23 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { /// Get needle stats. - const size_t needle_stats_size - = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); + const size_t needle_stats_size = dispatchSearcher( + calculateNeedleStats, + needle, + needle_size, + common_stats, + needle_ngram_storage.get()); size_t distance = needle_stats_size; /// Combine with haystack stats, return to initial needle stats. const size_t haystack_stats_size = dispatchSearcher( - calculateHaystackStatsAndMetric, haystack, haystack_size, common_stats, distance, haystack_ngram_storage.get()); + calculateHaystackStatsAndMetric, + haystack, + haystack_size, + common_stats, + distance, + haystack_ngram_storage.get()); /// Return to zero array stats. 
for (size_t j = 0; j < needle_stats_size; ++j) @@ -382,8 +390,12 @@ struct NgramDistanceImpl if (needle_size <= max_string_size && haystack_size <= max_string_size) { - const size_t needle_stats_size - = dispatchSearcher(calculateNeedleStats, needle, needle_size, common_stats, needle_ngram_storage.get()); + const size_t needle_stats_size = dispatchSearcher( + calculateNeedleStats, + needle, + needle_size, + common_stats, + needle_ngram_storage.get()); size_t distance = needle_stats_size; @@ -407,11 +419,15 @@ struct NgramDistanceImpl prev_offset = needle_offsets[i]; } + } } static void vectorConstant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) + const ColumnString::Chars & data, + const ColumnString::Offsets & offsets, + std::string needle, + PaddedPODArray & res) { /// zeroing our map NgramStats common_stats = {}; @@ -437,8 +453,7 @@ struct NgramDistanceImpl size_t haystack_stats_size = dispatchSearcher( calculateHaystackStatsAndMetric, reinterpret_cast(haystack), - haystack_size, - common_stats, + haystack_size, common_stats, distance, ngram_storage.get()); /// For !symmetric version we should not use haystack_stats_size. 
@@ -500,18 +515,14 @@ struct NameNgramSearchUTF8CaseInsensitive }; using FunctionNgramDistance = FunctionsStringSimilarity, NameNgramDistance>; -using FunctionNgramDistanceCaseInsensitive - = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; +using FunctionNgramDistanceCaseInsensitive = FunctionsStringSimilarity, NameNgramDistanceCaseInsensitive>; using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8>; -using FunctionNgramDistanceCaseInsensitiveUTF8 - = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; +using FunctionNgramDistanceCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramDistanceUTF8CaseInsensitive>; using FunctionNgramSearch = FunctionsStringSimilarity, NameNgramSearch>; -using FunctionNgramSearchCaseInsensitive - = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; +using FunctionNgramSearchCaseInsensitive = FunctionsStringSimilarity, NameNgramSearchCaseInsensitive>; using FunctionNgramSearchUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8>; -using FunctionNgramSearchCaseInsensitiveUTF8 - = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; +using FunctionNgramSearchCaseInsensitiveUTF8 = FunctionsStringSimilarity, NameNgramSearchUTF8CaseInsensitive>; void registerFunctionsStringSimilarity(FunctionFactory & factory) From 07b5f9a58f1546a2afe1a65ef084d359b1c3dbf4 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 7 Aug 2020 18:04:51 +0300 Subject: [PATCH 009/504] Fix build. 
--- src/Functions/FunctionsMiscellaneous.h | 2 ++ src/Functions/FunctionsStringHash.h | 2 +- src/Functions/bitHammingDistance.cpp | 2 +- src/Functions/tupleHammingDistance.cpp | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Functions/FunctionsMiscellaneous.h b/src/Functions/FunctionsMiscellaneous.h index 5703f72ce2a..6cd11b12bd9 100644 --- a/src/Functions/FunctionsMiscellaneous.h +++ b/src/Functions/FunctionsMiscellaneous.h @@ -210,6 +210,8 @@ public: if (action.type == ExpressionAction::Type::JOIN || action.type == ExpressionAction::Type::ARRAY_JOIN) throw Exception("Expression with arrayJoin or other unusual action cannot be captured", ErrorCodes::BAD_ARGUMENTS); +std::cerr << "=============== FunctionCaptureOverloadResolver expr " << expression_actions->dumpActions() << std::endl; + std::unordered_map arguments_map; const auto & all_arguments = expression_actions->getRequiredColumnsWithTypes(); diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index 23c6db51e8e..64ee7f9fe59 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -46,7 +46,7 @@ public: bool useDefaultImplementationForConstants() const override { return true; } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override { const ColumnPtr & column = block.getByPosition(arguments[0]).column; using ResultType = typename Impl::ResultType; diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index 5c13a57c426..21d4aa2c69c 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -93,7 +93,7 @@ public: bool useDefaultImplementationForConstants() const override { return true; } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + void 
executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override { auto * left_generic = block.getByPosition(arguments[0]).type.get(); auto * right_generic = block.getByPosition(arguments[1]).type.get(); diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 8b3f9a696aa..a0dc938ab17 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -100,7 +100,7 @@ public: return std::make_shared(); } - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) const override { const ColumnWithTypeAndName & arg1 = block.getByPosition(arguments[0]); const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); From 3b8020168f5bea44c5a1429cddd9ada9159017c0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 8 Aug 2020 07:44:04 +0300 Subject: [PATCH 010/504] Add simple watchdog --- programs/server/Server.cpp | 56 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index ddc5ec080fb..c19a537795a 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -263,6 +264,7 @@ void checkForUsersNotInMainConfig( int Server::main(const std::vector & /*args*/) { Poco::Logger * log = &logger(); + UseSSL use_ssl; ThreadStatus thread_status; @@ -1178,8 +1180,62 @@ int Server::main(const std::vector & /*args*/) #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wmissing-declarations" + +void forkAndWatch(char * process_name) +{ + std::string original_process_name = process_name; + + memset(process_name, 0, original_process_name.size()); + strncpy(process_name, "clickhouse-watchdog", original_process_name.size()); + + 
setThreadName("clckhouse-watch"); /// 15 characters + + while (true) + { + pid_t pid = fork(); + + if (-1 == pid) + { + std::cerr << "Cannot fork\n"; + exit(1); + } + + if (0 == pid) + { + strncpy(process_name, original_process_name.data(), original_process_name.size()); + setThreadName("clickhouse-serv"); + return; + } + + int status = 0; + if (-1 == waitpid(pid, &status, 0)) + { + std::cerr << "Cannot waitpid\n"; + exit(2); + } + + if (WIFEXITED(status)) + { + std::cerr << fmt::format("Child process exited normally with code {}.\n", WEXITSTATUS(status)); + exit(status); + } + + if (WIFSIGNALED(status)) + std::cerr << fmt::format("Child process was terminated by signal {}.\n", WTERMSIG(status)); + else if (WIFSTOPPED(status)) + std::cerr << fmt::format("Child process was stopped by signal {}.\n", WSTOPSIG(status)); + else + std::cerr << "Child process was not exited normally by unknown reason.\n"; + + std::cerr << "Will restart.\n"; + } +} + + int mainEntryClickHouseServer(int argc, char ** argv) { + forkAndWatch(argv[0]); + DB::Server app; try { From ae716e13e0a804d4a7d570287f0a17d00754deb7 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 8 Aug 2020 07:52:09 +0300 Subject: [PATCH 011/504] Watchdog (experimental) --- programs/server/Server.cpp | 108 +++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c19a537795a..57d8956ff69 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -93,6 +93,8 @@ namespace CurrentMetrics namespace { +char * process_name; /// argv[0] + void setupTmpPath(Poco::Logger * log, const std::string & path) { LOG_DEBUG(log, "Setting up {} to store temporary data in it", path); @@ -261,9 +263,61 @@ void checkForUsersNotInMainConfig( } +void forkAndWatch(Poco::Logger * log) +{ + std::string original_process_name = process_name; + + memset(process_name, 0, original_process_name.size()); + 
strncpy(process_name, "clickhouse-watchdog", original_process_name.size()); + + setThreadName("clckhouse-watch"); /// 15 characters + + while (true) + { + pid_t pid = fork(); + + if (-1 == pid) + { + LOG_FATAL(log, "Cannot fork"); + exit(1); + } + + if (0 == pid) + { + strncpy(process_name, original_process_name.data(), original_process_name.size()); + setThreadName("clickhouse-serv"); + return; + } + + int status = 0; + if (-1 == waitpid(pid, &status, 0)) + { + LOG_FATAL(log, "Cannot waitpid"); + exit(2); + } + + if (WIFEXITED(status)) + { + LOG_INFO(log, "Child process exited normally with code {}.", WEXITSTATUS(status)); + exit(status); + } + + if (WIFSIGNALED(status)) + LOG_FATAL(log, "Child process was terminated by signal {}.\n", WTERMSIG(status)); + else if (WIFSTOPPED(status)) + LOG_FATAL(log, "Child process was stopped by signal {}.\n", WSTOPSIG(status)); + else + LOG_FATAL(log, "Child process was not exited normally by unknown reason."); + + LOG_INFO(log, "Will restart."); + } +} + + int Server::main(const std::vector & /*args*/) { Poco::Logger * log = &logger(); + forkAndWatch(log); UseSSL use_ssl; @@ -1181,61 +1235,9 @@ int Server::main(const std::vector & /*args*/) #pragma GCC diagnostic ignored "-Wmissing-declarations" -void forkAndWatch(char * process_name) -{ - std::string original_process_name = process_name; - - memset(process_name, 0, original_process_name.size()); - strncpy(process_name, "clickhouse-watchdog", original_process_name.size()); - - setThreadName("clckhouse-watch"); /// 15 characters - - while (true) - { - pid_t pid = fork(); - - if (-1 == pid) - { - std::cerr << "Cannot fork\n"; - exit(1); - } - - if (0 == pid) - { - strncpy(process_name, original_process_name.data(), original_process_name.size()); - setThreadName("clickhouse-serv"); - return; - } - - int status = 0; - if (-1 == waitpid(pid, &status, 0)) - { - std::cerr << "Cannot waitpid\n"; - exit(2); - } - - if (WIFEXITED(status)) - { - std::cerr << fmt::format("Child process 
exited normally with code {}.\n", WEXITSTATUS(status)); - exit(status); - } - - if (WIFSIGNALED(status)) - std::cerr << fmt::format("Child process was terminated by signal {}.\n", WTERMSIG(status)); - else if (WIFSTOPPED(status)) - std::cerr << fmt::format("Child process was stopped by signal {}.\n", WSTOPSIG(status)); - else - std::cerr << "Child process was not exited normally by unknown reason.\n"; - - std::cerr << "Will restart.\n"; - } -} - - int mainEntryClickHouseServer(int argc, char ** argv) { - forkAndWatch(argv[0]); - + process_name = argv[0]; DB::Server app; try { From 11966f62576e527e26d2f751f7ecf1cdc1cde14b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 10 Aug 2020 18:22:08 +0300 Subject: [PATCH 012/504] Update FunctionsMiscellaneous.h --- src/Functions/FunctionsMiscellaneous.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Functions/FunctionsMiscellaneous.h b/src/Functions/FunctionsMiscellaneous.h index 6cd11b12bd9..5703f72ce2a 100644 --- a/src/Functions/FunctionsMiscellaneous.h +++ b/src/Functions/FunctionsMiscellaneous.h @@ -210,8 +210,6 @@ public: if (action.type == ExpressionAction::Type::JOIN || action.type == ExpressionAction::Type::ARRAY_JOIN) throw Exception("Expression with arrayJoin or other unusual action cannot be captured", ErrorCodes::BAD_ARGUMENTS); -std::cerr << "=============== FunctionCaptureOverloadResolver expr " << expression_actions->dumpActions() << std::endl; - std::unordered_map arguments_map; const auto & all_arguments = expression_actions->getRequiredColumnsWithTypes(); From 2067501ead62322c51ec4de0bea469c7e758d8b9 Mon Sep 17 00:00:00 2001 From: feng lv Date: Sun, 16 Aug 2020 15:42:35 +0800 Subject: [PATCH 013/504] fix --- src/Functions/bitHammingDistance.cpp | 23 ++++++++++++----------- src/Functions/tupleHammingDistance.cpp | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index 
5c13a57c426..08678689a15 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -19,21 +19,21 @@ struct BitHammingDistanceImpl { using ResultType = UInt8; - static void NO_INLINE vector_vector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) + static void NO_INLINE vectorVector(const PaddedPODArray & a, const PaddedPODArray & b, PaddedPODArray & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) c[i] = apply(a[i], b[i]); } - static void NO_INLINE vector_constant(const PaddedPODArray & a, B b, PaddedPODArray & c) + static void NO_INLINE vectorConstant(const PaddedPODArray & a, B b, PaddedPODArray & c) { size_t size = a.size(); for (size_t i = 0; i < size; ++i) c[i] = apply(a[i], b); } - static void NO_INLINE constant_vector(A a, const PaddedPODArray & b, PaddedPODArray & c) + static void NO_INLINE constantVector(A a, const PaddedPODArray & b, PaddedPODArray & c) { size_t size = b.size(); for (size_t i = 0; i < size; ++i) @@ -95,9 +95,10 @@ public: void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t) override { - auto * left_generic = block.getByPosition(arguments[0]).type.get(); - auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { + const auto * left_generic = block.getByPosition(arguments[0]).type.get(); + const auto * right_generic = block.getByPosition(arguments[1]).type.get(); + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -108,8 +109,8 @@ public: using OpImpl = BitHammingDistanceImpl; - auto col_left_raw = block.getByPosition(arguments[0]).column.get(); - auto col_right_raw = block.getByPosition(arguments[1]).column.get(); + const auto col_left_raw = 
block.getByPosition(arguments[0]).column.get(); + const auto col_right_raw = block.getByPosition(arguments[1]).column.get(); typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); @@ -122,7 +123,7 @@ public: if (auto col_right = checkAndGetColumn(col_right_raw)) { // constant integer - non-constant integer - OpImpl::constant_vector(col_left_const->template getValue(), col_right->getData(), vec_res); + OpImpl::constantVector(col_left_const->template getValue(), col_right->getData(), vec_res); } else return false; @@ -131,10 +132,10 @@ public: { if (auto col_right = checkAndGetColumn(col_right_raw)) // non-constant integer - non-constant integer - OpImpl::vector_vector(col_left->getData(), col_right->getData(), vec_res); + OpImpl::vectorVector(col_left->getData(), col_right->getData(), vec_res); else if (auto col_right_const = checkAndGetColumnConst(col_right_raw)) // non-constant integer - constant integer - OpImpl::vector_constant(col_left->getData(), col_right_const->template getValue(), vec_res); + OpImpl::vectorConstant(col_left->getData(), col_right_const->template getValue(), vec_res); else return false; } diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 8b3f9a696aa..c2d0ae66875 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -20,7 +20,7 @@ struct TupleHammingDistanceImpl { using ResultType = UInt8; - static void NO_INLINE vector_vector( + static void NO_INLINE vectorVector( const PaddedPODArray & a1, const PaddedPODArray & b1, const PaddedPODArray & a2, @@ -33,7 +33,7 @@ struct TupleHammingDistanceImpl } static void NO_INLINE - vector_constant(const PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) + vectorConstant(const PaddedPODArray & a1, const PaddedPODArray & b1, UInt64 a2, UInt64 b2, PaddedPODArray & c) { size_t size = a1.size(); for (size_t i = 0; i < size; ++i) @@ -41,14 +41,14 @@ 
struct TupleHammingDistanceImpl } static void NO_INLINE - constant_vector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) + constantVector(UInt64 a1, UInt64 b1, const PaddedPODArray & a2, const PaddedPODArray & b2, PaddedPODArray & c) { size_t size = a2.size(); for (size_t i = 0; i < size; ++i) c[i] = apply(a1, a2[i]) + apply(b1, b2[i]); } - static ResultType constant_constant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } + static ResultType constantConstant(UInt64 a1, UInt64 b1, UInt64 a2, UInt64 b2) { return apply(a1, a2) + apply(b1, b2); } private: static inline UInt8 apply(UInt64 a, UInt64 b) { return a != b; } @@ -112,7 +112,8 @@ public: throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", ErrorCodes::ILLEGAL_COLUMN); - bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) { + bool valid = castBothTypes(left_elems[0].get(), right_elems[0].get(), [&](const auto & left, const auto & right) + { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -137,7 +138,7 @@ public: cols1[1]->get(0, b1); cols2[0]->get(0, a2); cols2[1]->get(0, b2); - auto res = OpImpl::constant_constant(a1.get(), b1.get(), a2.get(), b2.get()); + auto res = OpImpl::constantConstant(a1.get(), b1.get(), a2.get(), b2.get()); block.getByPosition(result).column = DataTypeUInt8().createColumnConst(const_col_left->size(), toField(res)); return true; } @@ -159,7 +160,7 @@ public: auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); if (col_r1 && col_r2) - OpImpl::constant_vector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); + OpImpl::constantVector(a1.get(), b1.get(), col_r1->getData(), col_r2->getData(), vec_res); else return false; } @@ 
-179,7 +180,7 @@ public: Field a2, b2; const_cols[0]->get(0, a2); const_cols[1]->get(0, b2); - OpImpl::vector_constant(col_l1->getData(), col_l2->getData(), a2.get(), a2.get(), vec_res); + OpImpl::vectorConstant(col_l1->getData(), col_l2->getData(), a2.get(), a2.get(), vec_res); } // non-constant tuple - non-constant tuple else if (const ColumnTuple * col_right = typeid_cast(arg2.column.get())) @@ -187,7 +188,7 @@ public: auto col_r1 = checkAndGetColumn(&col_right->getColumn(0)); auto col_r2 = checkAndGetColumn(&col_right->getColumn(1)); if (col_r1 && col_r2) - OpImpl::vector_vector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res); + OpImpl::vectorVector(col_l1->getData(), col_l2->getData(), col_r1->getData(), col_r2->getData(), vec_res); else return false; } From 6bc8da633c9121875679b0fa659213476f2b6849 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 16 Aug 2020 14:52:55 +0300 Subject: [PATCH 014/504] Minor modification --- programs/server/Server.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 05b05934d35..04d116af3fa 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -309,7 +309,8 @@ void forkAndWatch(Poco::Logger * log) else LOG_FATAL(log, "Child process was not exited normally by unknown reason."); - LOG_INFO(log, "Will restart."); + exit(status); + // LOG_INFO(log, "Will restart."); } } @@ -317,7 +318,9 @@ void forkAndWatch(Poco::Logger * log) int Server::main(const std::vector & /*args*/) { Poco::Logger * log = &logger(); - forkAndWatch(log); + + if (!isInteractive()) + forkAndWatch(log); UseSSL use_ssl; From 9a370a03ef19e6e306e479cd5f7ac445a67fad75 Mon Sep 17 00:00:00 2001 From: feng lv Date: Thu, 10 Sep 2020 15:36:38 +0800 Subject: [PATCH 015/504] fix fix --- src/Functions/FunctionsStringHash.h | 1 - src/Functions/bitHammingDistance.cpp | 7 +++---- src/Functions/tupleHammingDistance.cpp | 4 ++-- 3 
files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsStringHash.h b/src/Functions/FunctionsStringHash.h index 64ee7f9fe59..19fea2d4fc6 100644 --- a/src/Functions/FunctionsStringHash.h +++ b/src/Functions/FunctionsStringHash.h @@ -15,7 +15,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int TOO_LARGE_STRING_SIZE; } // FunctionStringHash diff --git a/src/Functions/bitHammingDistance.cpp b/src/Functions/bitHammingDistance.cpp index cb79b498aa6..cb34634b00d 100644 --- a/src/Functions/bitHammingDistance.cpp +++ b/src/Functions/bitHammingDistance.cpp @@ -97,8 +97,7 @@ public: { const auto * left_generic = block.getByPosition(arguments[0]).type.get(); const auto * right_generic = block.getByPosition(arguments[1]).type.get(); - bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) - { + bool valid = castBothTypes(left_generic, right_generic, [&](const auto & left, const auto & right) { using LeftDataType = std::decay_t; using RightDataType = std::decay_t; using T0 = typename LeftDataType::FieldType; @@ -109,8 +108,8 @@ public: using OpImpl = BitHammingDistanceImpl; - const auto col_left_raw = block.getByPosition(arguments[0]).column.get(); - const auto col_right_raw = block.getByPosition(arguments[1]).column.get(); + const auto * const col_left_raw = block.getByPosition(arguments[0]).column.get(); + const auto * const col_right_raw = block.getByPosition(arguments[1]).column.get(); typename ColVecResult::MutablePtr col_res = nullptr; col_res = ColVecResult::create(); diff --git a/src/Functions/tupleHammingDistance.cpp b/src/Functions/tupleHammingDistance.cpp index 2f0475f3a6c..aa38426d228 100644 --- a/src/Functions/tupleHammingDistance.cpp +++ b/src/Functions/tupleHammingDistance.cpp @@ -106,8 +106,8 @@ public: const ColumnWithTypeAndName & arg2 = block.getByPosition(arguments[1]); const DataTypeTuple & type1 = static_cast(*arg1.type); const 
DataTypeTuple & type2 = static_cast(*arg2.type); - auto & left_elems = type1.getElements(); - auto & right_elems = type2.getElements(); + const auto & left_elems = type1.getElements(); + const auto & right_elems = type2.getElements(); if (left_elems.size() != 2 || right_elems.size() != 2) throw Exception( "Illegal column of arguments of function " + getName() + ", tuple should have exactly two elements.", From 5884b1e79b0eb40d9c39b019d345d9dbc3c45640 Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Fri, 30 Oct 2020 04:23:38 +0300 Subject: [PATCH 016/504] my changes to gitignore --- .gitignore | 3 +++ contrib/AMQP-CPP | 2 +- contrib/cyrus-sasl | 2 +- contrib/grpc | 2 +- contrib/jemalloc | 2 +- contrib/libhdfs3 | 2 +- contrib/mariadb-connector-c | 2 +- contrib/openssl | 2 +- contrib/poco | 2 +- contrib/replxx | 2 +- 10 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 52d58e68cb6..6a88dbf59e9 100644 --- a/.gitignore +++ b/.gitignore @@ -124,3 +124,6 @@ website/package-lock.json # Toolchains /cmake/toolchain/* + +# My changes +/contrib/* diff --git a/contrib/AMQP-CPP b/contrib/AMQP-CPP index d63e1f01658..1c08399ab0a 160000 --- a/contrib/AMQP-CPP +++ b/contrib/AMQP-CPP @@ -1 +1 @@ -Subproject commit d63e1f016582e9faaaf279aa24513087a07bc6e7 +Subproject commit 1c08399ab0ab9e4042ef8e2bbe9e208e5dcbc13b diff --git a/contrib/cyrus-sasl b/contrib/cyrus-sasl index 9995bf9d8e1..6054630889f 160000 --- a/contrib/cyrus-sasl +++ b/contrib/cyrus-sasl @@ -1 +1 @@ -Subproject commit 9995bf9d8e14f58934d9313ac64f13780d6dd3c9 +Subproject commit 6054630889fd1cd8d0659573d69badcee1e23a00 diff --git a/contrib/grpc b/contrib/grpc index a6570b863cf..8aea4e168e7 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit a6570b863cf76c9699580ba51c7827d5bffaac43 +Subproject commit 8aea4e168e78f3eb9828080740fc8cb73d53bf79 diff --git a/contrib/jemalloc b/contrib/jemalloc index 93e27e435ca..026764f1999 160000 --- a/contrib/jemalloc +++ 
b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit 93e27e435cac846028da20cd9b0841fbc9110bd2 +Subproject commit 026764f19995c53583ab25a3b9c06a2fd74e4689 diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index 30552ac527f..1b666578c85 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit 30552ac527f2c14070d834e171493b2e7f662375 +Subproject commit 1b666578c85094306b061352078022f6350bfab8 diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 1485b0de3ea..3f512fedf0b 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 1485b0de3eaa1508dfe49a5ba1e4aa2a71fd8335 +Subproject commit 3f512fedf0ba0f769a1b4852b4bac542d92c5b20 diff --git a/contrib/openssl b/contrib/openssl index 237260dd6a4..07e96230645 160000 --- a/contrib/openssl +++ b/contrib/openssl @@ -1 +1 @@ -Subproject commit 237260dd6a4bca5cb5a321d366a8a9c807957455 +Subproject commit 07e9623064508d15dd61367f960ebe7fc9aecd77 diff --git a/contrib/poco b/contrib/poco index 757d947235b..297fc905e16 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 757d947235b307675cff964f29b19d388140a9eb +Subproject commit 297fc905e166392156f83b96aaa5f44e8a6a35c4 diff --git a/contrib/replxx b/contrib/replxx index 8cf626c04e9..94b1f568d16 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 8cf626c04e9a74313fb0b474cdbe2297c0f3cdc8 +Subproject commit 94b1f568d16183214d26c7c0e9ce69a4ce407f65 From 078a52ae62c014ad0a2a37bdd8b94679301c30b4 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 5 Nov 2020 04:35:44 +0300 Subject: [PATCH 017/504] Updated description --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 5c8dc5fd272..c8a88545e97 100644 --- 
a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -325,7 +325,7 @@ This function accepts a number or date or date with time, and returns a FixedStr ## reinterpretAsUUID {#reinterpretasuuid} -This function accepts FixedString, and returns UUID. Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. +This function accepts big-endian `FixedString`, and returns `UUID`. Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. ## CAST(x, T) {#type_conversion_function-cast} From b9f287ed76be6505daced918a6cb66b8b49862c5 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 6 Nov 2020 22:22:04 +0300 Subject: [PATCH 018/504] Updated description --- .../functions/type-conversion-functions.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index c8a88545e97..f9506606d92 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -325,7 +325,16 @@ This function accepts a number or date or date with time, and returns a FixedStr ## reinterpretAsUUID {#reinterpretasuuid} -This function accepts big-endian `FixedString`, and returns `UUID`. Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. 
+This function accepts big-endian [FixedString](../../sql-reference/data-types/fixedstring), and returns [UUID](../../sql-reference/data-types/uuid). Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. + +**Syntax** + +``` sql +reinterpretAsUUID(fixed_string) +``` +**Returned value** + +- `UUID`. ## CAST(x, T) {#type_conversion_function-cast} From 80f9ba3e5b002a6a86fdbc7a94b4513121e45824 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 6 Nov 2020 22:22:59 +0300 Subject: [PATCH 019/504] Added translation --- .../functions/type-conversion-functions.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 773850b65ce..19d7e68b4e4 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -319,6 +319,19 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut Функция принимает число или дату или дату-с-временем и возвращает строку, содержащую байты, представляющие соответствующее значение в host order (little endian). При этом, отбрасываются нулевые байты с конца. Например, значение 255 типа UInt32 будет строкой длины 1 байт. +## reinterpretAsUUID {#reinterpretasuuid} + +Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. + +**Syntax** + +``` sql +reinterpretAsUUID(fixed_string) +``` +**Returned value** + +- `UUID`. 
+ ## CAST(x, T) {#type_conversion_function-cast} Преобразует x в тип данных t. From 5d1f67a5e2e4d909cc0c06d485c7eab5896c94b9 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 6 Nov 2020 22:46:45 +0300 Subject: [PATCH 020/504] Fixed links --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f9506606d92..e865abd141d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -325,7 +325,7 @@ This function accepts a number or date or date with time, and returns a FixedStr ## reinterpretAsUUID {#reinterpretasuuid} -This function accepts big-endian [FixedString](../../sql-reference/data-types/fixedstring), and returns [UUID](../../sql-reference/data-types/uuid). Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. +This function accepts big-endian [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring), and returns [UUID](../../sql-reference/data-types/uuid.md#uuid). Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. 
**Syntax** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 19d7e68b4e4..132ff5adb7c 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -321,7 +321,7 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ## reinterpretAsUUID {#reinterpretasuuid} -Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. +Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid.md#uuid). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. 
**Syntax** From efd24ac0181c07a44c4f2a37c0b9482d70880bf6 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 6 Nov 2020 23:07:31 +0300 Subject: [PATCH 021/504] Fixed links 2 --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index e865abd141d..71802ad08bf 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -325,7 +325,7 @@ This function accepts a number or date or date with time, and returns a FixedStr ## reinterpretAsUUID {#reinterpretasuuid} -This function accepts big-endian [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring), and returns [UUID](../../sql-reference/data-types/uuid.md#uuid). Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. +This function accepts big-endian [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring), and returns [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. 
**Syntax** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 132ff5adb7c..d874520e22b 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -321,7 +321,7 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ## reinterpretAsUUID {#reinterpretasuuid} -Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid.md#uuid). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. +Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. 
**Syntax** From 24497801655fc2a4e6caef745ac000a978ba7927 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 6 Nov 2020 23:19:26 +0300 Subject: [PATCH 022/504] Fixed mistakes in translation --- docs/ru/sql-reference/functions/type-conversion-functions.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index d874520e22b..7ec37bda741 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -323,12 +323,13 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. -**Syntax** +**Синтаксис** ``` sql reinterpretAsUUID(fixed_string) ``` -**Returned value** + +**Возвращаемое значение** - `UUID`. 
From b480c3c2d32d0995c036161854c8b53960ddb686 Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Sun, 8 Nov 2020 08:56:56 +0300 Subject: [PATCH 023/504] Docs for the Regexp input format (en) --- docs/en/interfaces/formats.md | 107 ++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 23 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index d310705d1c1..071c0d72d27 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -57,6 +57,7 @@ The supported formats are: | [XML](#xml) | ✗ | ✔ | | [CapnProto](#capnproto) | ✔ | ✗ | | [LineAsString](#lineasstring) | ✔ | ✗ | +| [Regexp](#data-format-regexp) | ✔ | ✗ | You can control some format processing parameters with the ClickHouse settings. For more information read the [Settings](../operations/settings/settings.md) section. @@ -1290,6 +1291,89 @@ $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT OR To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md). +## LineAsString {#lineasstring} + +In this format, a sequence of string objects separated by a newline character is interpreted as a single value. This format can only be parsed for table with a single field of type [String](../sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted. 
+ +**Example** + +Query: + +``` sql +DROP TABLE IF EXISTS line_as_string; +CREATE TABLE line_as_string (field String) ENGINE = Memory; +INSERT INTO line_as_string FORMAT LineAsString "I love apple", "I love banana", "I love orange"; +SELECT * FROM line_as_string; +``` + +Result: + +``` text +┌─field─────────────────────────────────────────────┐ +│ "I love apple", "I love banana", "I love orange"; │ +└───────────────────────────────────────────────────┘ +``` + +## Regexp {#data-format-regexp} + +When working with the `Regexp` format, you can use the following settings: + +- `format_regexp` — [String](../sql-reference/data-types/string.md). Contains regular expression in the [re2](https://github.com/google/re2/wiki/Syntax) format. +- `format_regexp_escaping_rule` — [String](../sql-reference/data-types/string.md). The following escaping rules are supported: + - CSV (similarly to [CSV](#csv)) + - JSON (similarly to [JSONEachRow](#jsoneachrow)) + - Escaped (similarly to [TSV](#tabseparated)) + - Quoted (similarly to [Values](#data-format-values)) + - Raw (extracts subpatterns as a whole, no escaping rules) +- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Defines the need to throw an exception in case the `format_regexp` expression does not match the imported data. Can be set to `0` or `1`. + +**Usage** + +The regular expression from `format_regexp` setting is applied to every line of imported data. The number of subpatterns in the regular expression must be equal to the number of columns in imported dataset. + +Lines of the imported data must be separated by newline character `'\n'` or DOS-style newline `"\r\n"` (except the `Raw` format, which does not support any escaping characters). + +The content of every matched subpattern is parsed with the method of corresponding data type, according to `format_regexp_escaping_rule` setting. 
+ +If the regular expression does not match the line and `format_regexp_skip_unmatched` is set to 1, the line is silently skipped. If `format_regexp_skip_unmatched` is set to 0, an exception is thrown. + +**Example** + +Consider the file data.tsv: + +```text +id: 1 array: [1,2,3] string: str1 date: 2020-01-01 +id: 2 array: [1,2,3] string: str2 date: 2020-01-02 +id: 3 array: [1,2,3] string: str3 date: 2020-01-03 +``` +and the table: + +```sql +CREATE TABLE imp_regex_table (id UInt32, array Array(UInt32), string String, date Date) ENGINE = Memory; +``` + +Import command: + +```bash +$ cat data.tsv | clickhouse-client --query "INSERT INTO imp_regex_table FORMAT Regexp SETTINGS format_regexp='id: (.+?) array: (.+?) string: (.+?) date: (.+?)', format_regexp_escaping_rule='Escaped', format_regexp_skip_unmatched=0;" +``` + +Query: + +```sql +SELECT * FROM imp_regex_table; +``` + +Result: + +```txt +┌─id─┬─array───┬─string─┬───────date─┐ +│ 1 │ [1,2,3] │ str1 │ 2020-01-01 │ +│ 2 │ [1,2,3] │ str2 │ 2020-01-02 │ +│ 3 │ [1,2,3] │ str3 │ 2020-01-03 │ +└────┴─────────┴────────┴────────────┘ +``` + ## Format Schema {#formatschema} The file name containing the format schema is set by the setting `format_schema`. @@ -1315,27 +1399,4 @@ Limitations: - In case of parsing error `JSONEachRow` skips all data until the new line (or EOF), so rows must be delimited by `\n` to count errors correctly. - `Template` and `CustomSeparated` use delimiter after the last column and delimiter between rows to find the beginning of next row, so skipping errors works only if at least one of them is not empty. -## LineAsString {#lineasstring} - -In this format, a sequence of string objects separated by a newline character is interpreted as a single value. This format can only be parsed for table with a single field of type [String](../sql-reference/data-types/string.md). 
The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted. - -**Example** - -Query: - -``` sql -DROP TABLE IF EXISTS line_as_string; -CREATE TABLE line_as_string (field String) ENGINE = Memory; -INSERT INTO line_as_string FORMAT LineAsString "I love apple", "I love banana", "I love orange"; -SELECT * FROM line_as_string; -``` - -Result: - -``` text -┌─field─────────────────────────────────────────────┐ -│ "I love apple", "I love banana", "I love orange"; │ -└───────────────────────────────────────────────────┘ -``` - [Original article](https://clickhouse.tech/docs/en/interfaces/formats/) From fd93d31950c95c15dd489aedd8641416f3db1321 Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Sun, 8 Nov 2020 22:33:42 +0300 Subject: [PATCH 024/504] Revert "my changes to gitignore" This reverts commit 5884b1e79b0eb40d9c39b019d345d9dbc3c45640. --- .gitignore | 3 --- contrib/AMQP-CPP | 2 +- contrib/cyrus-sasl | 2 +- contrib/grpc | 2 +- contrib/jemalloc | 2 +- contrib/libhdfs3 | 2 +- contrib/mariadb-connector-c | 2 +- contrib/openssl | 2 +- contrib/poco | 2 +- contrib/replxx | 2 +- 10 files changed, 9 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 6a88dbf59e9..52d58e68cb6 100644 --- a/.gitignore +++ b/.gitignore @@ -124,6 +124,3 @@ website/package-lock.json # Toolchains /cmake/toolchain/* - -# My changes -/contrib/* diff --git a/contrib/AMQP-CPP b/contrib/AMQP-CPP index 1c08399ab0a..d63e1f01658 160000 --- a/contrib/AMQP-CPP +++ b/contrib/AMQP-CPP @@ -1 +1 @@ -Subproject commit 1c08399ab0ab9e4042ef8e2bbe9e208e5dcbc13b +Subproject commit d63e1f016582e9faaaf279aa24513087a07bc6e7 diff --git a/contrib/cyrus-sasl b/contrib/cyrus-sasl index 6054630889f..9995bf9d8e1 160000 --- a/contrib/cyrus-sasl +++ b/contrib/cyrus-sasl @@ -1 +1 @@ -Subproject commit 6054630889fd1cd8d0659573d69badcee1e23a00 +Subproject commit 
9995bf9d8e14f58934d9313ac64f13780d6dd3c9 diff --git a/contrib/grpc b/contrib/grpc index 8aea4e168e7..a6570b863cf 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit 8aea4e168e78f3eb9828080740fc8cb73d53bf79 +Subproject commit a6570b863cf76c9699580ba51c7827d5bffaac43 diff --git a/contrib/jemalloc b/contrib/jemalloc index 026764f1999..93e27e435ca 160000 --- a/contrib/jemalloc +++ b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit 026764f19995c53583ab25a3b9c06a2fd74e4689 +Subproject commit 93e27e435cac846028da20cd9b0841fbc9110bd2 diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index 1b666578c85..30552ac527f 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit 1b666578c85094306b061352078022f6350bfab8 +Subproject commit 30552ac527f2c14070d834e171493b2e7f662375 diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 3f512fedf0b..1485b0de3ea 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 3f512fedf0ba0f769a1b4852b4bac542d92c5b20 +Subproject commit 1485b0de3eaa1508dfe49a5ba1e4aa2a71fd8335 diff --git a/contrib/openssl b/contrib/openssl index 07e96230645..237260dd6a4 160000 --- a/contrib/openssl +++ b/contrib/openssl @@ -1 +1 @@ -Subproject commit 07e9623064508d15dd61367f960ebe7fc9aecd77 +Subproject commit 237260dd6a4bca5cb5a321d366a8a9c807957455 diff --git a/contrib/poco b/contrib/poco index 297fc905e16..757d947235b 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 297fc905e166392156f83b96aaa5f44e8a6a35c4 +Subproject commit 757d947235b307675cff964f29b19d388140a9eb diff --git a/contrib/replxx b/contrib/replxx index 94b1f568d16..8cf626c04e9 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 94b1f568d16183214d26c7c0e9ce69a4ce407f65 +Subproject commit 8cf626c04e9a74313fb0b474cdbe2297c0f3cdc8 From 1721341fe8b4339aee32abce7710f5934e108cff Mon Sep 17 00:00:00 2001 From: George Date: Mon, 9 Nov 
2020 17:19:12 +0300 Subject: [PATCH 025/504] Improved description and added examples --- .../functions/type-conversion-functions.md | 47 ++++++++++++++++++- .../functions/type-conversion-functions.md | 46 +++++++++++++++++- 2 files changed, 89 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 71802ad08bf..d55d5a0fecb 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -325,16 +325,59 @@ This function accepts a number or date or date with time, and returns a FixedStr ## reinterpretAsUUID {#reinterpretasuuid} -This function accepts big-endian [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring), and returns [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). Takes 16 bytes string. If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. +This function accepts 16 bytes string, and returns UUID containing bytes representing the corresponding value in network byte order (big-endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes to the end. If the string longer than 16 bytes, the extra bytes at the end are ignored. **Syntax** ``` sql reinterpretAsUUID(fixed_string) ``` + +**Parameters** + +- `fixed_string` — Big-endian byte string. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring). + **Returned value** -- `UUID`. +- The UUID type value. [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). + +**Examples** + +String to UUID. 
+ +Query: + +``` sql +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +``` + +Result: + +``` text +┌─reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f')))─┐ +│ 08090a0b-0c0d-0e0f-0001-020304050607 │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +Going back and forth from String to UUID. + +Query: + +``` sql +WITH + generateUUIDv4() AS uuid, + identity(lower(hex(reverse(reinterpretAsString(uuid))))) AS str, + reinterpretAsUUID(reverse(unhex(str))) AS uuid2 +SELECT uuid = uuid2; +``` + +Result: + +``` text +┌─equals(uuid, uuid2)─┐ +│ 1 │ +└─────────────────────┘ +``` ## CAST(x, T) {#type_conversion_function-cast} diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 7ec37bda741..62b045543d4 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -321,7 +321,7 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ## reinterpretAsUUID {#reinterpretasuuid} -Функция принимает шестнадцатибайтную big-endian строку типа [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring) и возвращает [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. +Функция принимает шестнадцатибайтную строку и интерпретирует ее байты в network order (big-endian). Если строка имеет недостаточную длину, то функция работает так, как будто строка дополнена необходимым количетсвом нулевых байт с конца. Если строка длиннее, чем шестнадцать байт, то игнорируются лишние байты с конца. 
**Синтаксис** @@ -329,9 +329,51 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut reinterpretAsUUID(fixed_string) ``` +**Параметры** + +- `fixed_string` — Строка с big-endian порядком байтов. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring). + **Возвращаемое значение** -- `UUID`. +- Значение типа UUID. [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). + +**Примеры** + +Интерпретация строки как UUID. + +Запрос: + +``` sql +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +``` + +Результат: + +``` text +┌─reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f')))─┐ +│ 08090a0b-0c0d-0e0f-0001-020304050607 │ +└───────────────────────────────────────────────────────────────────────┘ +``` + +Переход в UUID и обратно. + +Запрос: + +``` sql +WITH + generateUUIDv4() AS uuid, + identity(lower(hex(reverse(reinterpretAsString(uuid))))) AS str, + reinterpretAsUUID(reverse(unhex(str))) AS uuid2 +SELECT uuid = uuid2; +``` + +Результат: + +``` text +┌─equals(uuid, uuid2)─┐ +│ 1 │ +└─────────────────────┘ +``` ## CAST(x, T) {#type_conversion_function-cast} From 683960725a0646609d4fe4619762758ba6d913ea Mon Sep 17 00:00:00 2001 From: George Date: Mon, 9 Nov 2020 17:30:26 +0300 Subject: [PATCH 026/504] Minor fixes --- docs/ru/sql-reference/functions/type-conversion-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 62b045543d4..e80157d70fb 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -331,11 +331,11 @@ reinterpretAsUUID(fixed_string) **Параметры** -- `fixed_string` — Строка с big-endian порядком байтов. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring). 
+- `fixed_string` — cтрока с big-endian порядком байтов. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring). **Возвращаемое значение** -- Значение типа UUID. [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). +- Значение типа [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type). **Примеры** From 4b17188de1b7a1b73ba637baac277d7c1b1bde5f Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Mon, 9 Nov 2020 22:40:32 +0300 Subject: [PATCH 027/504] Minor fix in en. translated to ru. Plus template updates (administrative).. --- .../template-system-table.md | 2 +- docs/en/interfaces/formats.md | 4 +- docs/ru/interfaces/formats.md | 98 ++++++++++++++++--- 3 files changed, 89 insertions(+), 15 deletions(-) diff --git a/docs/_description_templates/template-system-table.md b/docs/_description_templates/template-system-table.md index 137766a34b6..3fdf9788d79 100644 --- a/docs/_description_templates/template-system-table.md +++ b/docs/_description_templates/template-system-table.md @@ -1,4 +1,4 @@ -## system.table_name {#system-tables_table-name} +# system.table_name {#system-tables_table-name} Description. diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 071c0d72d27..70cd4d57600 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1293,7 +1293,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e ## LineAsString {#lineasstring} -In this format, a sequence of string objects separated by a newline character is interpreted as a single value. This format can only be parsed for table with a single field of type [String](../sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted. +In this format, every line of input data is interpreted as a single string value. 
This format can only be parsed for table with a single field of type [String](../sql-reference/data-types/string.md). The remaining columns must be set to [DEFAULT](../sql-reference/statements/create/table.md#default) or [MATERIALIZED](../sql-reference/statements/create/table.md#materialized), or omitted. **Example** @@ -1316,6 +1316,8 @@ Result: ## Regexp {#data-format-regexp} +Each line of imported data is parsed according to the regular expression. + When working with the `Regexp` format, you can use the following settings: - `format_regexp` — [String](../sql-reference/data-types/string.md). Contains regular expression in the [re2](https://github.com/google/re2/wiki/Syntax) format. diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 042c62e310c..0d16e999488 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1209,22 +1209,10 @@ $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT OR Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). -## Схема формата {#formatschema} - -Имя файла со схемой записывается в настройке `format_schema`. При использовании форматов `Cap'n Proto` и `Protobuf` требуется указать схему. -Схема представляет собой имя файла и имя типа в этом файле, разделенные двоеточием, например `schemafile.proto:MessageType`. -Если файл имеет стандартное расширение для данного формата (например `.proto` для `Protobuf`), -то можно его не указывать и записывать схему так `schemafile:MessageType`. - -Если для ввода/вывода данных используется [клиент](../interfaces/cli.md) в [интерактивном режиме](../interfaces/cli.md#cli_usage), то при записи схемы можно использовать абсолютный путь или записывать путь -относительно текущей директории на клиенте. Если клиент используется в [batch режиме](../interfaces/cli.md#cli_usage), то в записи схемы допускается только относительный путь, из соображений безопасности. 
- -Если для ввода/вывода данных используется [HTTP-интерфейс](../interfaces/http.md), то файл со схемой должен располагаться на сервере в каталоге, -указанном в параметре [format_schema_path](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-format_schema_path) конфигурации сервера. ## LineAsString {#lineasstring} - В этом формате последовательность строковых объектов, разделенных символом новой строки, интерпретируется как одно значение. Парситься может только таблица с единственным полем типа [String](../sql-reference/data-types/string.md). Остальные столбцы должны быть заданы как [DEFAULT](../sql-reference/statements/create/table.md#create-default-values) или [MATERIALIZED](../sql-reference/statements/create/table.md#create-default-values), либо отсутствовать. + В этом формате каждая строка импортируемых данных интерпретируется как одно строковое значение. Парситься может только таблица с единственным полем типа [String](../sql-reference/data-types/string.md). Остальные столбцы должны быть заданы как [DEFAULT](../sql-reference/statements/create/table.md#create-default-values) или [MATERIALIZED](../sql-reference/statements/create/table.md#create-default-values), либо отсутствовать. **Пример** @@ -1245,4 +1233,88 @@ SELECT * FROM line_as_string; └───────────────────────────────────────────────────┘ ``` +## Regexp {#data-format-regexp} + +Каждая строка импортируемых данных разбирается в соответствии с регулярным выражением. + +При работе с форматом `Regexp` можно использовать следующие параметры: + +- `format_regexp` — [String](../sql-reference/data-types/string.md). Строка с регулярным выражением в формате [re2](https://github.com/google/re2/wiki/Syntax). +- `format_regexp_escaping_rule` — [String](../sql-reference/data-types/string.md). Правило сериализации. 
Поддерживаются следующие правила: + - CSV (как в [CSV](#csv)) + - JSON (как в [JSONEachRow](#jsoneachrow)) + - Escaped (как в [TSV](#tabseparated)) + - Quoted (как в [Values](#data-format-values)) + - Raw (данные импортируются как есть, без сериализации) +- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Признак, будет ли генерироваться исключение в случае, если импортируемые данные не соответствуют регулярному выражению `format_regexp`. Может принимать значение `0` или `1`. + +**Использование** + +Регулярное выражение (шаблон) из параметра `format_regexp` применяется к каждой строке импортируемых данных. Количество частей в шаблоне (подшаблонов) должно соответствовать количеству колонок в импортируемых данных. + +Строки импортируемых данных должны разделяться символом новой строки `'\n'` или символами `"\r\n"` (перенос строки в формате DOS), за исключением формата `Raw`, который не поддерживает сериализацию. + +Данные, выделенные по подшаблонам, интерпретируются в соответствии с типом, указанным в параметре `format_regexp_escaping_rule`. + +Если строка импортируемых данных не соответствует регулярному выражению и параметр `format_regexp_skip_unmatched` равен 1, строка просто игнорируется. Если же параметр `format_regexp_skip_unmatched` равен 0, генерируется исключение. + +**Пример** + +Рассмотрим файл data.tsv: + +```text +id: 1 array: [1,2,3] string: str1 date: 2020-01-01 +id: 2 array: [1,2,3] string: str2 date: 2020-01-02 +id: 3 array: [1,2,3] string: str3 date: 2020-01-03 +``` +и таблицу: + +```sql +CREATE TABLE imp_regex_table (id UInt32, array Array(UInt32), string String, date Date) ENGINE = Memory; +``` + +Команда импорта: + +```bash +$ cat data.tsv | clickhouse-client --query "INSERT INTO imp_regex_table FORMAT Regexp SETTINGS format_regexp='id: (.+?) array: (.+?) string: (.+?) 
date: (.+?)', format_regexp_escaping_rule='Escaped', format_regexp_skip_unmatched=0;" +``` + +Запрос: + +```sql +SELECT * FROM imp_regex_table; +``` + +Результат: + +```txt +┌─id─┬─array───┬─string─┬───────date─┐ +│ 1 │ [1,2,3] │ str1 │ 2020-01-01 │ +│ 2 │ [1,2,3] │ str2 │ 2020-01-02 │ +│ 3 │ [1,2,3] │ str3 │ 2020-01-03 │ +└────┴─────────┴────────┴────────────┘ +``` + + +## Схема формата {#formatschema} + +Имя файла со схемой записывается в настройке `format_schema`. При использовании форматов `Cap'n Proto` и `Protobuf` требуется указать схему. +Схема представляет собой имя файла и имя типа в этом файле, разделенные двоеточием, например `schemafile.proto:MessageType`. +Если файл имеет стандартное расширение для данного формата (например `.proto` для `Protobuf`), +то можно его не указывать и записывать схему так `schemafile:MessageType`. + +Если для ввода/вывода данных используется [клиент](../interfaces/cli.md) в [интерактивном режиме](../interfaces/cli.md#cli_usage), то при записи схемы можно использовать абсолютный путь или записывать путь +относительно текущей директории на клиенте. Если клиент используется в [batch режиме](../interfaces/cli.md#cli_usage), то в записи схемы допускается только относительный путь, из соображений безопасности. + +Если для ввода/вывода данных используется [HTTP-интерфейс](../interfaces/http.md), то файл со схемой должен располагаться на сервере в каталоге, +указанном в параметре [format_schema_path](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-format_schema_path) конфигурации сервера. + +## Игнорирование ошибок {#skippingerrors} + +Некоторые форматы, такие как `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` и `Protobuf`, могут игнорировать строки, которые не соответствуют правилам и разбор которых может вызвать ошибку. При этом обработка импортируемых данных продолжается со следующей строки. См. 
настройки [input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) и +[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio). +Ограничения: +- В формате `JSONEachRow` в случае ошибки игнорируются все данные до конца текущей строки (или до конца файла). Поэтому строки должны быть разделены символом `\n`, чтобы ошибки обрабатывались корректно. +- Форматы `Template` и `CustomSeparated` используют разделитель после последней колонки и разделитель между строками. Поэтому игнорирование ошибок работает только если хотя бы одна из строк не пустая. + [Оригинальная статья](https://clickhouse.tech/docs/ru/interfaces/formats/) From 713d809096f07ab6119dd2122e1a99559aa4c8ee Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Mon, 9 Nov 2020 22:50:43 +0300 Subject: [PATCH 028/504] Links fixed (ru) --- docs/ru/interfaces/formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 0d16e999488..dd75927db7a 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1311,8 +1311,8 @@ SELECT * FROM imp_regex_table; ## Игнорирование ошибок {#skippingerrors} -Некоторые форматы, такие как `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` и `Protobuf`, могут игнорировать строки, которые не соответствуют правилам и разбор которых может вызвать ошибку. При этом обработка импортируемых данных продолжается со следующей строки. См. настройки [input_format_allow_errors_num](../operations/settings/settings.md#settings-input_format_allow_errors_num) и -[input_format_allow_errors_ratio](../operations/settings/settings.md#settings-input_format_allow_errors_ratio). 
+Некоторые форматы, такие как `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` и `Protobuf`, могут игнорировать строки, которые не соответствуют правилам и разбор которых может вызвать ошибку. При этом обработка импортируемых данных продолжается со следующей строки. См. настройки [input_format_allow_errors_num](../operations/settings/settings.md#input_format_allow_errors_num) и +[input_format_allow_errors_ratio](../operations/settings/settings.md#input_format_allow_errors_ratio). Ограничения: - В формате `JSONEachRow` в случае ошибки игнорируются все данные до конца текущей строки (или до конца файла). Поэтому строки должны быть разделены символом `\n`, чтобы ошибки обрабатывались корректно. - Форматы `Template` и `CustomSeparated` используют разделитель после последней колонки и разделитель между строками. Поэтому игнорирование ошибок работает только если хотя бы одна из строк не пустая. From fc4410ed5c55e1a938dbb829a7d560ac6372d270 Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Mon, 9 Nov 2020 22:59:34 +0300 Subject: [PATCH 029/504] Now really --- docs/ru/interfaces/formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index dd75927db7a..287ae142bdd 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1311,8 +1311,8 @@ SELECT * FROM imp_regex_table; ## Игнорирование ошибок {#skippingerrors} -Некоторые форматы, такие как `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` и `Protobuf`, могут игнорировать строки, которые не соответствуют правилам и разбор которых может вызвать ошибку. При этом обработка импортируемых данных продолжается со следующей строки. См. настройки [input_format_allow_errors_num](../operations/settings/settings.md#input_format_allow_errors_num) и -[input_format_allow_errors_ratio](../operations/settings/settings.md#input_format_allow_errors_ratio). 
+Некоторые форматы, такие как `CSV`, `TabSeparated`, `TSKV`, `JSONEachRow`, `Template`, `CustomSeparated` и `Protobuf`, могут игнорировать строки, которые не соответствуют правилам и разбор которых может вызвать ошибку. При этом обработка импортируемых данных продолжается со следующей строки. См. настройки [input_format_allow_errors_num](../operations/settings/settings.md#input-format-allow-errors-num) и +[input_format_allow_errors_ratio](../operations/settings/settings.md#input-format-allow-errors-ratio). Ограничения: - В формате `JSONEachRow` в случае ошибки игнорируются все данные до конца текущей строки (или до конца файла). Поэтому строки должны быть разделены символом `\n`, чтобы ошибки обрабатывались корректно. - Форматы `Template` и `CustomSeparated` используют разделитель после последней колонки и разделитель между строками. Поэтому игнорирование ошибок работает только если хотя бы одна из строк не пустая. From d3d78286424d32c8bc08be779fb785eb2688c6b7 Mon Sep 17 00:00:00 2001 From: Olga Revyakina Date: Mon, 9 Nov 2020 23:07:34 +0300 Subject: [PATCH 030/504] Minor formatting error fixed --- docs/en/interfaces/formats.md | 2 +- docs/ru/interfaces/formats.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 70cd4d57600..6c57fb62939 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1368,7 +1368,7 @@ SELECT * FROM imp_regex_table; Result: -```txt +```text ┌─id─┬─array───┬─string─┬───────date─┐ │ 1 │ [1,2,3] │ str1 │ 2020-01-01 │ │ 2 │ [1,2,3] │ str2 │ 2020-01-02 │ diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 287ae142bdd..153653d8cff 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1287,7 +1287,7 @@ SELECT * FROM imp_regex_table; Результат: -```txt +```text ┌─id─┬─array───┬─string─┬───────date─┐ │ 1 │ [1,2,3] │ str1 │ 2020-01-01 │ │ 2 │ [1,2,3] │ str2 │ 2020-01-02 │ From 
ad9a0c6144c228a8e4c0ade2652fe483be3934bc Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 15:43:37 +0300 Subject: [PATCH 031/504] Update poco --- contrib/poco | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/poco b/contrib/poco index f49c6ab8d3a..4d06db3947a 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit f49c6ab8d3aa71828bd1b411485c21722e8c9d82 +Subproject commit 4d06db3947ac2343133220a5cd5d9b35bc89814c From fa9814921c758fb1e5b439843832f51a1fe22ca5 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 17:57:31 +0300 Subject: [PATCH 032/504] Updae boost. --- contrib/boost | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/boost b/contrib/boost index a04e72c0464..d2da6db25de 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit a04e72c0464f0c31d3384f18f0c0db36a05538e0 +Subproject commit d2da6db25deff1a927b166ce62cf8967a40273c9 From 0f293e60e1e411cfcee762625ff5522b119d287d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 18:22:34 +0300 Subject: [PATCH 033/504] Update CMakeLists.txt for boost --- contrib/boost-cmake/CMakeLists.txt | 92 ++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index fd860c9f9b0..b3bf3b38f8b 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -11,6 +11,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) iostreams program_options regex + context ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND @@ -27,18 +28,21 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_program_options INTERFACE) add_library (_boost_regex INTERFACE) add_library (_boost_system INTERFACE) + add_library (_boost_context INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams 
INTERFACE ${Boost_IOSTREAMS_LIBRARY}) target_link_libraries (_boost_program_options INTERFACE ${Boost_PROGRAM_OPTIONS_LIBRARY}) target_link_libraries (_boost_regex INTERFACE ${Boost_REGEX_LIBRARY}) target_link_libraries (_boost_system INTERFACE ${Boost_SYSTEM_LIBRARY}) + target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) add_library (boost::program_options ALIAS _boost_program_options) add_library (boost::regex ALIAS _boost_regex) add_library (boost::system ALIAS _boost_system) + add_library (boost::context ALIAS _boost_context) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -142,4 +146,92 @@ if (NOT EXTERNAL_BOOST_FOUND) add_library (_boost_system ${SRCS_SYSTEM}) add_library (boost::system ALIAS _boost_system) target_include_directories (_boost_system PRIVATE ${LIBRARY_DIR}) + + # context + + set (SRCS_CONTEXT + ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_pe_armasm.asm + ${LIBRARY_DIR}/libs/context/src/asm/jump_combined_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_ms_pe_gas.asm + ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_ms_pe_masm.asm + ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_mips32_o32_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_mips64_n64_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_ppc64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_elf_gas.S + 
${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_xcoff_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_xcoff_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_ms_pe_gas.asm + ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_ms_pe_masm.asm + ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_pe_armasm.asm + ${LIBRARY_DIR}/libs/context/src/asm/make_combined_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_i386_ms_pe_gas.asm + ${LIBRARY_DIR}/libs/context/src/asm/make_i386_ms_pe_masm.asm + ${LIBRARY_DIR}/libs/context/src/asm/make_i386_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_i386_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_i386_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_mips32_o32_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_mips64_n64_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_ppc64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_xcoff_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_xcoff_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_ms_pe_gas.asm + 
${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_ms_pe_masm.asm + ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_pe_armasm.asm + ${LIBRARY_DIR}/libs/context/src/asm/ontop_combined_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_ms_pe_gas.asm + ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_ms_pe_masm.asm + ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_mips32_o32_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_mips64_n64_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_ppc64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_xcoff_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_xcoff_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_ms_pe_gas.asm + ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_ms_pe_masm.asm + ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S + ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S + ${LIBRARY_DIR}/libs/context/src/continuation.cpp + ${LIBRARY_DIR}/libs/context/src/dummy.cpp + ${LIBRARY_DIR}/libs/context/src/execution_context.cpp + ${LIBRARY_DIR}/libs/context/src/fiber.cpp + 
${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp + ${LIBRARY_DIR}/libs/context/src/untested.cpp + ${LIBRARY_DIR}/libs/context/src/windows/stack_traits.cpp + ) + + add_library (_boost_context ${SRCS_CONTEXT}) + add_library (boost::context ALIAS _boost_context) + target_include_directories (_boost_context PRIVATE ${LIBRARY_DIR}) endif () From 4f442cd8f396cc0297052b17b4a50bcb3984d53d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 18:32:01 +0300 Subject: [PATCH 034/504] Update CMakeLists.txt for boost --- contrib/boost-cmake/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index b3bf3b38f8b..59deea802cf 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -148,6 +148,8 @@ if (NOT EXTERNAL_BOOST_FOUND) target_include_directories (_boost_system PRIVATE ${LIBRARY_DIR}) # context + enable_language(ASM) + SET(ASM_OPTIONS "-x assembler-with-cpp") set (SRCS_CONTEXT ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S From d50a0e63e6189b98581309f0ed2c3eb4e780b6ef Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 18:46:58 +0300 Subject: [PATCH 035/504] Added example from boost. 
--- src/Interpreters/tests/CMakeLists.txt | 3 +++ src/Interpreters/tests/context.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 src/Interpreters/tests/context.cpp diff --git a/src/Interpreters/tests/CMakeLists.txt b/src/Interpreters/tests/CMakeLists.txt index 20aa73166fb..9bdedc7b76b 100644 --- a/src/Interpreters/tests/CMakeLists.txt +++ b/src/Interpreters/tests/CMakeLists.txt @@ -29,6 +29,9 @@ target_link_libraries (string_hash_map PRIVATE dbms) add_executable (string_hash_map_aggregation string_hash_map.cpp) target_link_libraries (string_hash_map_aggregation PRIVATE dbms) +add_executable (context context.cpp) +target_link_libraries (context PRIVATE dbms) + add_executable (two_level_hash_map two_level_hash_map.cpp) target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) target_link_libraries (two_level_hash_map PRIVATE dbms) diff --git a/src/Interpreters/tests/context.cpp b/src/Interpreters/tests/context.cpp new file mode 100644 index 00000000000..eed7ca60790 --- /dev/null +++ b/src/Interpreters/tests/context.cpp @@ -0,0 +1,26 @@ +#include +#include + +int main(int, char **) +{ + namespace ctx=boost::context; + int a; + ctx::fiber source{[&a](ctx::fiber&& sink) + { + a=0; + int b=1; + while (true) + { + sink=std::move(sink).resume(); + int next=a+b; + a=b; + b=next; + } + return std::move(sink); + }}; + for (int j=0;j<10;++j) + { + source=std::move(source).resume(); + std::cout << a << " "; + } +} From 0e043202598010d9ffab947ca475244e66370054 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 19:10:18 +0300 Subject: [PATCH 036/504] Update CMakeLists.txt --- contrib/boost-cmake/CMakeLists.txt | 138 +++++++++++++------------- src/Interpreters/tests/CMakeLists.txt | 2 +- 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 59deea802cf..712aa7cdac0 100644 --- 
a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -152,78 +152,78 @@ if (NOT EXTERNAL_BOOST_FOUND) SET(ASM_OPTIONS "-x assembler-with-cpp") set (SRCS_CONTEXT - ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_pe_armasm.asm - ${LIBRARY_DIR}/libs/context/src/asm/jump_combined_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_ms_pe_gas.asm - ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_ms_pe_masm.asm - ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_mips32_o32_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_mips64_n64_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_ppc64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_xcoff_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_xcoff_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_ms_pe_gas.asm - ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_ms_pe_masm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_arm64_aapcs_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_arm_aapcs_pe_armasm.asm +# 
${LIBRARY_DIR}/libs/context/src/asm/jump_combined_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_ms_pe_gas.asm +# ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_ms_pe_masm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_i386_x86_64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_mips32_o32_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_mips64_n64_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_ppc64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc32_sysv_xcoff_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_ppc64_sysv_xcoff_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_ms_pe_gas.asm +# ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_ms_pe_masm.asm ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_pe_armasm.asm - ${LIBRARY_DIR}/libs/context/src/asm/make_combined_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_i386_ms_pe_gas.asm - ${LIBRARY_DIR}/libs/context/src/asm/make_i386_ms_pe_masm.asm - ${LIBRARY_DIR}/libs/context/src/asm/make_i386_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_i386_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_i386_x86_64_sysv_macho_gas.S - 
${LIBRARY_DIR}/libs/context/src/asm/make_mips32_o32_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_mips64_n64_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_ppc64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_xcoff_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_xcoff_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_ms_pe_gas.asm - ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_ms_pe_masm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/jump_x86_64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_arm64_aapcs_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_arm_aapcs_pe_armasm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/make_combined_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_i386_ms_pe_gas.asm +# ${LIBRARY_DIR}/libs/context/src/asm/make_i386_ms_pe_masm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/make_i386_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_i386_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_i386_x86_64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_mips32_o32_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_mips64_n64_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_ppc64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_ppc32_sysv_xcoff_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_elf_gas.S +# 
${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_ppc64_sysv_xcoff_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_ms_pe_gas.asm +# ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_ms_pe_masm.asm ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_pe_armasm.asm - ${LIBRARY_DIR}/libs/context/src/asm/ontop_combined_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_ms_pe_gas.asm - ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_ms_pe_masm.asm - ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_x86_64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_mips32_o32_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_mips64_n64_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_ppc64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_xcoff_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_macho_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_xcoff_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_ms_pe_gas.asm - ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_ms_pe_masm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/make_x86_64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_elf_gas.S +# 
${LIBRARY_DIR}/libs/context/src/asm/ontop_arm64_aapcs_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_arm_aapcs_pe_armasm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_combined_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_ms_pe_gas.asm +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_ms_pe_masm.asm +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_i386_x86_64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_mips32_o32_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_mips64_n64_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_ppc64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc32_sysv_xcoff_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_elf_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_ppc64_sysv_xcoff_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_ms_pe_gas.asm +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_ms_pe_masm.asm ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_elf_gas.S - ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S +# ${LIBRARY_DIR}/libs/context/src/asm/ontop_x86_64_sysv_macho_gas.S ${LIBRARY_DIR}/libs/context/src/continuation.cpp ${LIBRARY_DIR}/libs/context/src/dummy.cpp ${LIBRARY_DIR}/libs/context/src/execution_context.cpp diff --git a/src/Interpreters/tests/CMakeLists.txt b/src/Interpreters/tests/CMakeLists.txt index 9bdedc7b76b..f8fa33d8d2e 100644 --- a/src/Interpreters/tests/CMakeLists.txt +++ b/src/Interpreters/tests/CMakeLists.txt @@ -30,7 +30,7 @@ 
add_executable (string_hash_map_aggregation string_hash_map.cpp) target_link_libraries (string_hash_map_aggregation PRIVATE dbms) add_executable (context context.cpp) -target_link_libraries (context PRIVATE dbms) +target_link_libraries (context PRIVATE dbms _boost_context) add_executable (two_level_hash_map two_level_hash_map.cpp) target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR}) From 319d36a3b76cc7b9000d05cc5b4236e1e4aa6739 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 30 Nov 2020 19:11:56 +0300 Subject: [PATCH 037/504] Update CMakeLists.txt --- contrib/boost-cmake/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 712aa7cdac0..5bf09ca29bd 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -229,8 +229,8 @@ if (NOT EXTERNAL_BOOST_FOUND) ${LIBRARY_DIR}/libs/context/src/execution_context.cpp ${LIBRARY_DIR}/libs/context/src/fiber.cpp ${LIBRARY_DIR}/libs/context/src/posix/stack_traits.cpp - ${LIBRARY_DIR}/libs/context/src/untested.cpp - ${LIBRARY_DIR}/libs/context/src/windows/stack_traits.cpp +# ${LIBRARY_DIR}/libs/context/src/untested.cpp +# ${LIBRARY_DIR}/libs/context/src/windows/stack_traits.cpp ) add_library (_boost_context ${SRCS_CONTEXT}) From 0fae325d76abb49c4a74e73ebe4bcccfcde9f88f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 2 Dec 2020 14:18:46 +0300 Subject: [PATCH 038/504] Add FiberStack --- src/Common/FiberStack.h | 45 +++++++++++++++++++ src/Interpreters/tests/context.cpp | 72 ++++++++++++++++++++++++++++-- 2 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 src/Common/FiberStack.h diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h new file mode 100644 index 00000000000..c133cb6250c --- /dev/null +++ b/src/Common/FiberStack.h @@ -0,0 +1,45 @@ +#include +#include + +#if defined(BOOST_USE_VALGRIND) 
+#include +#endif + +/// This is an implementation of allocator for fiber stack. +/// It uses internal allocator, so we track memory usage. It is the main reason why this class is needed. +/// The reference implementations are pooled_fixedsize_stack and protected_fixedsize_stack from boost::context. +template > +class FiberStack +{ +private: + size_t stack_size; +public: + /// 8MB of memory per fiber stack may seem too expensive. It is indeed. + /// The reason is that current (patched) libunwind needs > 4MB of stack memory to unwind stack. + /// If we allocate less memory, any thrown exception inside fiber will cause segfault. + static constexpr size_t default_stack_size = 8 * 1024 * 1024; + + explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) {} + + boost::context::stack_context allocate() + { + void * vp = TAllocator().alloc(stack_size); + + boost::context::stack_context sctx; + sctx.size = stack_size; + sctx.sp = static_cast< char * >(vp) + sctx.size; +#if defined(BOOST_USE_VALGRIND) + sctx.valgrind_stack_id = VALGRIND_STACK_REGISTER(sctx.sp, vp); +#endif + return sctx; + } + + void deallocate(boost::context::stack_context & sctx) + { +#if defined(BOOST_USE_VALGRIND) + VALGRIND_STACK_DEREGISTER( sctx.valgrind_stack_id); +#endif + void * vp = static_cast< char * >(sctx.sp) - sctx.size; + TAllocator().free(vp, stack_size); + } +}; diff --git a/src/Interpreters/tests/context.cpp b/src/Interpreters/tests/context.cpp index eed7ca60790..407a7a9ba3b 100644 --- a/src/Interpreters/tests/context.cpp +++ b/src/Interpreters/tests/context.cpp @@ -1,26 +1,90 @@ #include +/// #define BOOST_USE_UCONTEXT #include +#include +#include +#include +#include + +void __attribute__((__noinline__)) foo(std::exception_ptr exception) +{ + if (exception) + std::rethrow_exception(exception); +} + +void __attribute__((__noinline__)) bar(int a) +{ + std::cout << StackTrace().toString() << std::endl; + + if (a > 0) + throw DB::Exception(0, "hello"); +} + 
+void __attribute__((__noinline__)) gar(int a) +{ + char buf[1024]; + buf[1023] = a & 255; + if (a > 2) + return gar(a - 1); + else + bar(a); +} int main(int, char **) -{ +try { namespace ctx=boost::context; int a; - ctx::fiber source{[&a](ctx::fiber&& sink) + std::exception_ptr exception; + // ctx::protected_fixedsize allocator + // ctx::pooled_fixedsize_stack(1024 * 64 + 2 * 2 * 1024 * 1024 * 16, 1) + ctx::fiber source{std::allocator_arg_t(), FiberStack(), [&](ctx::fiber&& sink) { a=0; int b=1; - while (true) + for (size_t i = 0; i < 9; ++i) { sink=std::move(sink).resume(); int next=a+b; a=b; b=next; } + try + { + gar(1024); + } + catch (...) + { + std::cout << "Saving exception\n"; + exception = std::current_exception(); + } return std::move(sink); }}; + for (int j=0;j<10;++j) { - source=std::move(source).resume(); + try + { + source=std::move(source).resume(); + } + catch (DB::Exception & e) + { + std::cout << "Caught exception in resume " << e.getStackTraceString() << std::endl; + } std::cout << a << " "; } + + std::cout << std::endl; + + try + { + foo(exception); + } + catch (const DB::Exception & e) + { + std::cout << e.getStackTraceString() << std::endl; + } +} +catch (...) +{ + std::cerr << "Uncaught exception\n"; } From e3946bc2b54069b3549f3eb9dcccdc910674604d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 2 Dec 2020 20:02:14 +0300 Subject: [PATCH 039/504] Add async read to RemoteQueryExecutor. 
--- src/Client/Connection.h | 5 +- src/Client/MultiplexedConnections.h | 2 + src/DataStreams/RemoteQueryExecutor.cpp | 184 ++++++++++++++++-------- src/DataStreams/RemoteQueryExecutor.h | 24 ++++ src/IO/ReadBufferFromPocoSocket.cpp | 16 ++- src/IO/ReadBufferFromPocoSocket.h | 13 ++ 6 files changed, 186 insertions(+), 58 deletions(-) diff --git a/src/Client/Connection.h b/src/Client/Connection.h index f4c25001f3e..bc4decb67be 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -188,6 +189,8 @@ public: size_t outBytesCount() const { return out ? out->count() : 0; } size_t inBytesCount() const { return in ? in->count() : 0; } + void setFiber(ReadBufferFromPocoSocket::Fiber * fiber) { in->setFiber(fiber); } + private: String host; UInt16 port; @@ -224,7 +227,7 @@ private: String server_display_name; std::unique_ptr socket; - std::shared_ptr in; + std::shared_ptr in; std::shared_ptr out; std::optional last_input_packet_type; diff --git a/src/Client/MultiplexedConnections.h b/src/Client/MultiplexedConnections.h index eaec7f744bc..1ec424593b8 100644 --- a/src/Client/MultiplexedConnections.h +++ b/src/Client/MultiplexedConnections.h @@ -67,6 +67,8 @@ public: /// Without locking, because sendCancel() does not change the state of the replicas. bool hasActiveConnections() const { return active_connection_count > 0; } + void setFiber(ReadBufferFromPocoSocket::Fiber * fiber) { current_connection->setFiber(fiber); } + private: /// Internal version of `receivePacket` function without locking. 
Packet receivePacketUnlocked(); diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index a7fe9d99688..e4cbf21066d 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB { @@ -199,65 +200,136 @@ Block RemoteQueryExecutor::read() Packet packet = multiplexed_connections->receivePacket(); - switch (packet.type) + if (auto block = processPacket(std::move(packet))) + return *block; + } +} + +void RemoteQueryExecutor::read(ReadContext & read_context) +{ + if (!sent_query) + { + sendQuery(); + + if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size())) { - case Protocol::Server::Data: - /// If the block is not empty and is not a header block - if (packet.block && (packet.block.rows() > 0)) - return adaptBlockStructure(packet.block, header); - break; /// If the block is empty - we will receive other packets before EndOfStream. - - case Protocol::Server::Exception: - got_exception_from_replica = true; - packet.exception->rethrow(); - break; - - case Protocol::Server::EndOfStream: - if (!multiplexed_connections->hasActiveConnections()) - { - finished = true; - return Block(); - } - break; - - case Protocol::Server::Progress: - /** We use the progress from a remote server. - * We also include in ProcessList, - * and we use it to check - * constraints (for example, the minimum speed of query execution) - * and quotas (for example, the number of lines to read). - */ - if (progress_callback) - progress_callback(packet.progress); - break; - - case Protocol::Server::ProfileInfo: - /// Use own (client-side) info about read bytes, it is more correct info than server-side one. 
- if (profile_info_callback) - profile_info_callback(packet.profile_info); - break; - - case Protocol::Server::Totals: - totals = packet.block; - break; - - case Protocol::Server::Extremes: - extremes = packet.block; - break; - - case Protocol::Server::Log: - /// Pass logs from remote server to client - if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) - log_queue->pushBlock(std::move(packet.block)); - break; - - default: - got_unknown_packet_from_replica = true; - throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", - toString(packet.type), - multiplexed_connections->dumpAddresses()); + read_context.is_read_in_progress = false; + read_context.result.clear(); + return; } } + + do + { + if (!read_context.is_read_in_progress) + { + auto routine = [&read_context, this](boost::context::fiber && sink) + { + read_context.fiber_context.fiber = std::move(sink); + + try + { + multiplexed_connections->setFiber(&read_context.fiber_context); + read_context.packet = multiplexed_connections->receivePacket(); + multiplexed_connections->setFiber(nullptr); + } + catch (...) 
+ { + read_context.exception = std::current_exception(); + } + + return std::move(read_context.fiber_context.fiber); + }; + + read_context.fiber = boost::context::fiber(std::allocator_arg_t(), read_context.stack, std::move(routine)); + } + + read_context.fiber = std::move(read_context.fiber).resume(); + + if (read_context.exception) + std::rethrow_exception(std::move(read_context.exception)); + + if (read_context.fiber) + { + read_context.is_read_in_progress = true; + read_context.fd = read_context.fiber_context.fd; + return; + } + else + { + read_context.is_read_in_progress = false; + if (auto data = processPacket(std::move(read_context.packet))) + { + read_context.result = std::move(*data); + return; + } + } + } + while (true); +} + +std::optional RemoteQueryExecutor::processPacket(Packet packet) +{ + switch (packet.type) + { + case Protocol::Server::Data: + /// If the block is not empty and is not a header block + if (packet.block && (packet.block.rows() > 0)) + return adaptBlockStructure(packet.block, header); + break; /// If the block is empty - we will receive other packets before EndOfStream. + + case Protocol::Server::Exception: + got_exception_from_replica = true; + packet.exception->rethrow(); + break; + + case Protocol::Server::EndOfStream: + if (!multiplexed_connections->hasActiveConnections()) + { + finished = true; + return Block(); + } + break; + + case Protocol::Server::Progress: + /** We use the progress from a remote server. + * We also include in ProcessList, + * and we use it to check + * constraints (for example, the minimum speed of query execution) + * and quotas (for example, the number of lines to read). + */ + if (progress_callback) + progress_callback(packet.progress); + break; + + case Protocol::Server::ProfileInfo: + /// Use own (client-side) info about read bytes, it is more correct info than server-side one. 
+ if (profile_info_callback) + profile_info_callback(packet.profile_info); + break; + + case Protocol::Server::Totals: + totals = packet.block; + break; + + case Protocol::Server::Extremes: + extremes = packet.block; + break; + + case Protocol::Server::Log: + /// Pass logs from remote server to client + if (auto log_queue = CurrentThread::getInternalTextLogsQueue()) + log_queue->pushBlock(std::move(packet.block)); + break; + + default: + got_unknown_packet_from_replica = true; + throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", + toString(packet.type), + multiplexed_connections->dumpAddresses()); + } + + return {}; } void RemoteQueryExecutor::finish() diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index 0db0e0218be..633d442ffc9 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -46,11 +47,31 @@ public: ~RemoteQueryExecutor(); + struct ReadContext + { + bool is_read_in_progress = false; + + /// If is_read_in_progress, use this fd to poll + int fd; + + /// If not is_read_in_progress, result block is set. + Block result; + + /// Internal data + + boost::context::fiber fiber; + Packet packet; + std::exception_ptr exception; + FiberStack<> stack; + ReadBufferFromPocoSocket::Fiber fiber_context; + }; + /// Create connection and send query, external tables and scalars. void sendQuery(); /// Read next block of data. Returns empty block if query is finished. Block read(); + void read(ReadContext & read_context); /// Receive all remain packets and finish query. /// It should be cancelled after read returned empty block. @@ -159,6 +180,9 @@ private: /// Returns true if exception was thrown bool hasThrownException() const; + + /// Process packet for read and return data block if possible. 
+ std::optional processPacket(Packet packet); }; } diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 5c66c3209f6..85249002c51 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -28,10 +28,24 @@ bool ReadBufferFromPocoSocket::nextImpl() ssize_t bytes_read = 0; Stopwatch watch; + int flags = 0; + if (fiber) + flags |= MSG_DONTWAIT; + /// Add more details to exceptions. try { - bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size()); + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); + + /// If fiber is specified, and read is blocking, run fiber and try again later. + /// It is expected that file descriptor may be polled externally. + /// Note that receive timeout is not checked here. External code should check it while polling. + while (bytes_read < 0 && fiber && (errno == POCO_EAGAIN || errno == POCO_EWOULDBLOCK)) + { + fiber->fd = socket.impl()->sockfd(); + fiber->fiber = std::move(fiber->fiber).resume(); + bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); + } } catch (const Poco::Net::NetException & e) { diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index f328b89d99c..790a350a4de 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -5,6 +5,8 @@ #include #include +#include + namespace DB { @@ -28,6 +30,17 @@ public: ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); bool poll(size_t timeout_microseconds); + + struct Fiber + { + boost::context::fiber fiber; + int fd; + }; + + void setFiber(Fiber * fiber_) { fiber = fiber_; } + +private: + Fiber * fiber; }; } From 8ee86e35d24e969c6b3f34b8ac7fa290c38715de Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 2 Dec 2020 21:16:31 +0300 Subject: [PATCH 040/504] debug --- 
.../AggregateFunctionSum.cpp | 1 + .../registerFunctionsMiscellaneous.cpp | 2 + src/Functions/runningAccumulate.cpp | 132 ++++++++++++++++++ src/Interpreters/ExpressionAnalyzer.cpp | 68 +++++++++ src/Interpreters/ExpressionAnalyzer.h | 2 + .../Transforms/ExpressionTransform.h | 6 +- src/Storages/IStorage.cpp | 39 ++++++ src/Storages/SelectQueryInfo.h | 4 + 8 files changed, 253 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSum.cpp b/src/AggregateFunctions/AggregateFunctionSum.cpp index 6afae98ef2d..e937769f3a3 100644 --- a/src/AggregateFunctions/AggregateFunctionSum.cpp +++ b/src/AggregateFunctions/AggregateFunctionSum.cpp @@ -52,6 +52,7 @@ template using AggregateFunctionSumKahan = template