From 952b5ea24a09528d9f3caa4d3e033b182143a060 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 01:42:28 +0300 Subject: [PATCH 01/25] Rename trigramDistance to ngramDistance, add more functions with CaseInsensitive and UTF, update docs, more job done in perf, added some perf tests for string search that I would like to see --- .../Functions/FunctionsStringSimilarity.cpp | 340 ++++++++---- .../src/Functions/FunctionsStringSimilarity.h | 3 +- dbms/tests/performance/website/url_hits.xml | 6 + .../00909_ngram_distance.reference | 509 ++++++++++++++++++ .../0_stateless/00909_ngram_distance.sql | 106 ++++ .../00909_trigram_distance.reference | 119 ---- .../0_stateless/00909_trigram_distance.sql | 29 - .../functions/string_search_functions.md | 8 + .../functions/string_search_functions.md | 8 + 9 files changed, 878 insertions(+), 250 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/00909_ngram_distance.reference create mode 100644 dbms/tests/queries/0_stateless/00909_ngram_distance.sql delete mode 100644 dbms/tests/queries/0_stateless/00909_trigram_distance.reference delete mode 100644 dbms/tests/queries/0_stateless/00909_trigram_distance.sql diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index 7f0267d6d59..a90c7e82acd 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -8,164 +8,271 @@ #include +#include + #include +#include #include #include #include +#include #ifdef __SSE4_2__ -#include +# include #endif namespace DB { /** Distance function implementation. - * We calculate all the trigrams from left string and count by the index of + * We calculate all the n-grams from left string and count by the index of * 16 bits hash of them in the map. - * Then calculate all the trigrams from the right string and calculate - * the trigram distance on the flight by adding and subtracting from the hashmap. 
+ * Then calculate all the n-grams from the right string and calculate + * the n-gram distance on the fly by adding and subtracting from the hashmap. * Then return the map into the condition of which it was after the left string * calculation. If the right string size is big (more than 2**15 bytes), * the strings are not similar at all and we return 1. */ -struct TrigramDistanceImpl +template +struct NgramDistanceImpl { using ResultType = Float32; - using CodePoint = UInt32; - /// map_size for trigram difference + /// map_size for ngram difference. static constexpr size_t map_size = 1u << 16; - /// If the haystack size is bigger than this, behaviour is unspecified for this function + /// If the haystack size is bigger than this, behaviour is unspecified for this function. static constexpr size_t max_string_size = 1u << 15; + /// Default padding to read safely. + static constexpr size_t default_padding = 16; + + /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding. + static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1; + /** This fits mostly in L2 cache all the time. * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed * integer array.
*/ - using TrigramStats = UInt16[map_size]; + using NgramStats = UInt16[map_size]; - static ALWAYS_INLINE UInt16 trigramHash(CodePoint one, CodePoint two, CodePoint three) + static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points) { - UInt64 combined = (static_cast(one) << 32) | two; + return intHashCRC32(unalignedLoad(code_points)) & 0xFFFFu; + } + + static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points) + { + UInt64 combined = (static_cast(code_points[0]) << 32) | code_points[1]; #ifdef __SSE4_2__ - return _mm_crc32_u64(three, combined) & 0xFFFFu; + return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu; #else - return (intHashCRC32(combined) ^ intHashCRC32(three)) & 0xFFFFu; + return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu; #endif } - static ALWAYS_INLINE CodePoint readCodePoint(const char *& pos, const char * end) noexcept + template + static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence &) { - size_t length = UTF8::seqLength(*pos); - - if (pos + length > end) - length = end - pos; - - CodePoint res; - /// This is faster than just memcpy because of compiler optimizations with moving bytes. - switch (length) - { - case 1: - res = 0; - memcpy(&res, pos, 1); - break; - case 2: - res = 0; - memcpy(&res, pos, 2); - break; - case 3: - res = 0; - memcpy(&res, pos, 3); - break; - default: - memcpy(&res, pos, 4); - } - - pos += length; - return res; + ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...); } - static inline size_t calculateNeedleStats(const char * data, const size_t size, TrigramStats & trigram_stats) noexcept + static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end) { - size_t len = 0; - const char * start = data; - const char * end = data + size; - CodePoint cp1 = 0; - CodePoint cp2 = 0; - CodePoint cp3 = 0; + /// Offset before which we copy some data. 
+ constexpr size_t padding_offset = default_padding - N + 1; + /// We have an array like this for ASCII (N == 4, other cases are similar) + /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// And we copy ^^^^^^^^^^^^^^^ these bytes to the start + /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction + memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); + /// Now we have an array + /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18| + /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + /// Doing unaligned read of 16 bytes and copy them like above + /// 16 is also chosen to do two `movups`. + /// Such copying allow us to have 3 codepoints from the previous read to produce the n-gram with them. + memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); - while (start != end) + if constexpr (CaseInsensitive) { - cp1 = cp2; - cp2 = cp3; - cp3 = readCodePoint(start, end); - ++len; - if (len < 3) - continue; - ++trigram_stats[trigramHash(cp1, cp2, cp3)]; + /// We really need template lambdas with C++20 to do it inline + unrollLowering(code_points, std::make_index_sequence()); } - return std::max(static_cast(0), static_cast(len) - 2); + pos += padding_offset; + if (pos > end) + return default_padding - (pos - end); + return default_padding; } - static inline UInt64 calculateHaystackStatsAndMetric(const char * data, const size_t size, TrigramStats & trigram_stats, size_t & distance) + static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end) { - size_t len = 0; - size_t trigram_cnt = 0; + /// The same copying as described in the function above. 
+ memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint)); + + size_t num = N - 1; + while (num < default_padding && pos < end) + { + size_t length = UTF8::seqLength(*pos); + + if (pos + length > end) + length = end - pos; + + CodePoint res; + /// This is faster than just memcpy because of compiler optimizations with moving bytes. + switch (length) + { + case 1: + res = 0; + memcpy(&res, pos, 1); + break; + case 2: + res = 0; + memcpy(&res, pos, 2); + break; + case 3: + res = 0; + memcpy(&res, pos, 3); + break; + default: + memcpy(&res, pos, 4); + } + + /// This is not truly case-insensitive UTF-8. We zero the 5-th bit of every byte. + /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. It also works for most Cyrillic letters. + /// For others, we don't care now. Lowering UTF is not a cheap operation. + if constexpr (CaseInsensitive) + { + switch (length) + { + case 4: + res &= ~(1u << (5 + 3 * CHAR_BIT)); + [[fallthrough]]; + case 3: + res &= ~(1u << (5 + 2 * CHAR_BIT)); + [[fallthrough]]; + case 2: + res &= ~(1u << (5 + CHAR_BIT)); + [[fallthrough]]; + default: + res &= ~(1u << 5); + } + } + + pos += length; + code_points[num++] = res; + } + return num; + } + + static ALWAYS_INLINE inline size_t calculateNeedleStats( + const char * data, + const size_t size, + NgramStats & ngram_stats, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt16 (*hash_functor)(const CodePoint *)) + { + // To prevent size_t overflow below. + if (size < N) + return 0; + const char * start = data; const char * end = data + size; - CodePoint cp1 = 0; - CodePoint cp2 = 0; - CodePoint cp3 = 0; + CodePoint cp[simultaneously_codepoints_num] = {}; + + /// read_code_points returns the position of cp where it stopped reading codepoints. + size_t found = read_code_points(cp, start, end); + /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
+ size_t i = N - 1; + /// Initialize with this value because for the first time `found` does not initialize first N - 1 codepoints. + size_t len = -N + 1; + do + { + len += found - N + 1; + for (; i + N <= found; ++i) + ++ngram_stats[hash_functor(cp + i)]; + i = 0; + } while (start < end && (found = read_code_points(cp, start, end))); + + return len; + } + + static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric( + const char * data, + const size_t size, + NgramStats & ngram_stats, + size_t & distance, + size_t (*read_code_points)(CodePoint *, const char *&, const char *), + UInt16 (*hash_functor)(const CodePoint *)) + { + size_t ngram_cnt = 0; + const char * start = data; + const char * end = data + size; + CodePoint cp[simultaneously_codepoints_num] = {}; /// allocation tricks, most strings are relatively small static constexpr size_t small_buffer_size = 256; std::unique_ptr big_buffer; UInt16 small_buffer[small_buffer_size]; - UInt16 * trigram_storage = small_buffer; + UInt16 * ngram_storage = small_buffer; if (size > small_buffer_size) { - trigram_storage = new UInt16[size]; - big_buffer.reset(trigram_storage); + ngram_storage = new UInt16[size]; + big_buffer.reset(ngram_storage); } - while (start != end) + /// read_code_points returns the position of cp where it stopped reading codepoints. + size_t found = read_code_points(cp, start, end); + /// We need to start for the first time here, because first N - 1 codepoints mean nothing. 
+ size_t iter = N - 1; + + do { - cp1 = cp2; - cp2 = cp3; - cp3 = readCodePoint(start, end); - ++len; - if (len < 3) - continue; + for (; iter + N <= found; ++iter) + { + UInt16 hash = hash_functor(cp + iter); + if (static_cast(ngram_stats[hash]) > 0) + --distance; + else + ++distance; - UInt16 hash = trigramHash(cp1, cp2, cp3); - - if (static_cast(trigram_stats[hash]) > 0) - --distance; - else - ++distance; - - trigram_storage[trigram_cnt++] = hash; - --trigram_stats[hash]; - } + ngram_storage[ngram_cnt++] = hash; + --ngram_stats[hash]; + } + iter = 0; + } while (start < end && (found = read_code_points(cp, start, end))); /// Return the state of hash map to its initial. - for (size_t i = 0; i < trigram_cnt; ++i) - ++trigram_stats[trigram_storage[i]]; - - return trigram_cnt; + for (size_t i = 0; i < ngram_cnt; ++i) + ++ngram_stats[ngram_storage[i]]; + return ngram_cnt; } - static void constant_constant(const std::string & data, const std::string & needle, Float32 & res) + template + static inline size_t dispatchSearcher(Callback callback, Args &&... args) { - TrigramStats common_stats; + if constexpr (!UTF8) + return callback(std::forward(args)..., readASCIICodePoints, ASCIIHash); + else + return callback(std::forward(args)..., readUTF8CodePoints, UTF8Hash); + } + + static void constant_constant(std::string data, std::string needle, Float32 & res) + { + NgramStats common_stats; memset(common_stats, 0, sizeof(common_stats)); - size_t second_size = calculateNeedleStats(needle.data(), needle.size(), common_stats); + + /// We use unsafe versions of getting ngrams, so I decided to use padded strings. 
+ const size_t needle_size = needle.size(); + const size_t data_size = data.size(); + needle.resize(needle_size + default_padding); + data.resize(data_size + default_padding); + + size_t second_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats); size_t distance = second_size; - if (data.size() <= max_string_size) + if (data_size <= max_string_size) { - size_t first_size = calculateHaystackStatsAndMetric(data.data(), data.size(), common_stats, distance); + size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance); res = distance * 1.f / std::max(first_size + second_size, size_t(1)); } else @@ -175,11 +282,18 @@ struct TrigramDistanceImpl } static void vector_constant( - const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray & res) + const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray & res) { - TrigramStats common_stats; + /// zeroing our map + NgramStats common_stats; memset(common_stats, 0, sizeof(common_stats)); - const size_t needle_stats_size = calculateNeedleStats(needle.data(), needle.size(), common_stats); + + /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case. 
+ const size_t needle_size = needle.size(); + needle.resize(needle_size + default_padding); + + const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats); + size_t distance = needle_stats_size; size_t prev_offset = 0; for (size_t i = 0; i < offsets.size(); ++i) @@ -188,12 +302,13 @@ struct TrigramDistanceImpl const size_t haystack_size = offsets[i] - prev_offset - 1; if (haystack_size <= max_string_size) { - size_t haystack_stats_size - = calculateHaystackStatsAndMetric(reinterpret_cast(haystack), haystack_size, common_stats, distance); + size_t haystack_stats_size = dispatchSearcher( + calculateHaystackStatsAndMetric, reinterpret_cast(haystack), haystack_size, common_stats, distance); res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1)); } else { + /// if the strings are too big, we say they are completely not the same res[i] = 1.f; } distance = needle_stats_size; @@ -203,16 +318,39 @@ struct TrigramDistanceImpl }; -struct TrigramDistanceName +struct NgramDistanceName { - static constexpr auto name = "trigramDistance"; + static constexpr auto name = "ngramDistance"; }; -using FunctionTrigramsDistance = FunctionsStringSimilarity; +struct NgramDistanceCaseInsensitiveName +{ + static constexpr auto name = "ngramDistanceCaseInsensitive"; +}; + +struct NgramDistanceUTF8Name +{ + static constexpr auto name = "ngramDistanceUTF8"; +}; + +struct NgramDistanceUTF8CaseInsensitiveName +{ + static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8"; +}; + +using FunctionNgramDistance = FunctionsStringSimilarity, NgramDistanceName>; +using FunctionNgramDistanceCaseInsensitive + = FunctionsStringSimilarity, NgramDistanceCaseInsensitiveName>; +using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity, NgramDistanceUTF8Name>; +using FunctionNgramDistanceCaseInsensitiveUTF8 + = FunctionsStringSimilarity, NgramDistanceUTF8CaseInsensitiveName>; void 
registerFunctionsStringSimilarity(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); } } diff --git a/dbms/src/Functions/FunctionsStringSimilarity.h b/dbms/src/Functions/FunctionsStringSimilarity.h index 00c90e20569..c23d9be999a 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.h +++ b/dbms/src/Functions/FunctionsStringSimilarity.h @@ -12,8 +12,9 @@ namespace DB /** Calculate similarity metrics: * - * trigramDistance(haystack, needle) --- calculate so called 3-gram distance between haystack and needle. + * ngramDistance(haystack, needle) --- calculate n-gram distance between haystack and needle. * Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. + * Also support CaseInsensitive and UTF8 formats. */ namespace ErrorCodes diff --git a/dbms/tests/performance/website/url_hits.xml b/dbms/tests/performance/website/url_hits.xml index f83ec663ef7..88f48705d9a 100644 --- a/dbms/tests/performance/website/url_hits.xml +++ b/dbms/tests/performance/website/url_hits.xml @@ -78,5 +78,11 @@ SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100 SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000 SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND NOT Refresh AND NOT DontCountHits 
GROUP BY Minute ORDER BY Minute +SELECT count(multiSearch(URL, ['yandex', 'google', 'rambler'])) from {table}; +SELECT count(match(URL, 'google|yandex|rambler')) from hits_100m_single; +SELECT count(match(URL, 'google')), count(match(URL, 'yandex')), count(match(URL, 'rambler')) from {table} +SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 +SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.reference b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference new file mode 100644 index 00000000000..356cc5db466 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference @@ -0,0 +1,509 @@ +0 +0 +0 +0 +0 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +0 +77 +77 +77 +77 +77 +636 +636 +636 +636 +636 +1000 +1000 +1000 +1000 +1000 +0 +1000 +1000 +0 +77 +636 +1000 +привет как дела?... Херсон 297 +пап привет как дела - Яндекс.Видео 422 +привет как дела клип - Яндекс.Видео 435 +привет братан как дела - Яндекс.Видео 500 +привет 529 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... Херсон 459 +пап привет как дела - Яндекс.Видео 511 +привет 529 +привет как дела клип - Яндекс.Видео 565 +привет братан как дела - Яндекс.Видео 583 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 524 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://metrica.yandex.com/ 655 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 619 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +http://metrica.yandex.com/ 724 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +0 +0 +0 +0 +0 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +0 +77 +77 +77 +77 +77 +636 +636 +636 +636 +636 +1000 +1000 +1000 +1000 +1000 +0 +1000 +1000 +429 +77 +636 +1000 +привет как дела?... Херсон 297 +пап привет как дела - Яндекс.Видео 422 +привет как дела клип - Яндекс.Видео 435 +привет братан как дела - Яндекс.Видео 500 +привет 529 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... 
Херсон 676 +пап привет как дела - Яндекс.Видео 733 +привет как дела клип - Яндекс.Видео 739 +привет братан как дела - Яндекс.Видео 750 +привет 882 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 524 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 524 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://metrica.yandex.com/ 655 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 619 +http://metric.ru/ 700 +http://metris.ru/ 700 +http://autometric.ru/ 750 +http://metrica.yandex.com/ 793 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 600 +http://autometric.ru/ 667 +http://metris.ru/ 700 +http://metrika.ru/ 714 +http://metrica.yandex.com/ 724 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 714 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела клип - Яндекс.Видео 182 +пап привет как дела - Яндекс.Видео 354 +привет братан как дела - Яндекс.Видео 382 +привет как дела?... Херсон 649 +привет 838 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +111 +111 +111 +111 +111 +429 +429 +429 +429 +429 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +111 +429 +1000 +привет как дела?... Херсон 254 +пап привет как дела - Яндекс.Видео 398 +привет как дела клип - Яндекс.Видео 412 +привет братан как дела - Яндекс.Видео 461 +привет 471 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... Херсон 343 +пап привет как дела - Яндекс.Видео 446 +привет 471 +привет как дела клип - Яндекс.Видео 482 +привет братан как дела - Яндекс.Видео 506 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 579 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://metrica.yandex.com/ 704 +http://autometric.ru/ 727 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 684 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://autometric.ru/ 727 +http://metrica.yandex.com/ 778 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 769 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +111 +111 +111 +111 +111 +600 +600 +600 +600 +600 +1000 +1000 +1000 +1000 +1000 +0 +0 +0 +0 +111 +600 +1000 +привет как дела?... Херсон 910 +пап привет как дела - Яндекс.Видео 928 +привет как дела клип - Яндекс.Видео 929 +привет братан как дела - Яндекс.Видео 955 +привет 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +привет как дела?... 
Херсон 672 +пап привет как дела - Яндекс.Видео 735 +привет как дела клип - Яндекс.Видео 741 +привет братан как дела - Яндекс.Видео 753 +привет 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metrica.yandex.com/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 +http://metrika.ru/ 579 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 579 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://metrica.yandex.com/ 704 +http://autometric.ru/ 727 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrika.ru/ 684 +http://metric.ru/ 778 +http://metris.ru/ 778 +http://autometric.ru/ 818 +http://metrica.yandex.com/ 852 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metric.ru/ 667 +http://autometric.ru/ 727 +http://metrica.yandex.com/ 778 +http://metris.ru/ 778 +http://metrika.ru/ 789 +привет как дела?... Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 + 1000 +http://metrica.yandex.com/ 769 +привет как дела?... 
Херсон 1000 +привет как дела клип - Яндекс.Видео 1000 +привет 1000 +пап привет как дела - Яндекс.Видео 1000 +привет братан как дела - Яндекс.Видео 1000 +http://metric.ru/ 1000 +http://autometric.ru/ 1000 +http://metris.ru/ 1000 +http://metrika.ru/ 1000 + 1000 diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.sql b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql new file mode 100644 index 00000000000..867e69f4fe7 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql @@ -0,0 +1,106 @@ +select round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; +select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; + +select round(1000 * ngramDistanceUTF8('', '')); +select round(1000 * ngramDistanceUTF8('абв', '')); +select round(1000 * ngramDistanceUTF8('', 'абв')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'абвгдеёжз')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'абвгдеёж')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'гдеёзд')); +select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'ёёёёёёёё')); + +drop table if exists test.test_distance; +create table test.test_distance (Title String) engine = Memory; +insert into test.test_distance values ('привет как дела?... 
Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), (''); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'привет как дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'как привет дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metriks') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'yandex') as distance; + + +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5; + +select round(1000 * 
ngramDistanceCaseInsensitiveUTF8('', '')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв', '')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('', 'абв')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвГДЕёжз', 'АбвгдЕЁжз')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', 'АбвГдеёж')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', 'гдеёЗД')); +select round(1000 * ngramDistanceCaseInsensitiveUTF8('АБВГДеёжз', 'ЁЁЁЁЁЁЁЁ')); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'ПрИвЕт кАК ДЕЛа') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'как ПРИВЕТ дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'Metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'mEtrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metriKS') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'YanDEX') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'приВЕТ КАк ДеЛа КлИп - яндеКс.видео') as distance; + + +select round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize(''), 'abc')) 
from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5; + +select round(1000 * ngramDistance('', '')); +select round(1000 * ngramDistance('abc', '')); +select round(1000 * ngramDistance('', 'abc')); +select round(1000 * ngramDistance('abcdefgh', 'abcdefgh')); +select round(1000 * ngramDistance('abcdefgh', 'abcdefg')); +select round(1000 * ngramDistance('abcdefgh', 'defgh')); +select round(1000 * ngramDistance('abcdefgh', 'aaaaaaaa')); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'привет как дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'как привет дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metriks') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'yandex') as distance; + +select round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5; +select round(1000 * 
ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5; +select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5; + +select round(1000 * ngramDistanceCaseInsensitive('', '')); +select round(1000 * ngramDistanceCaseInsensitive('abc', '')); +select round(1000 * ngramDistanceCaseInsensitive('', 'abc')); +select round(1000 * ngramDistanceCaseInsensitive('abCdefgH', 'Abcdefgh')); +select round(1000 * ngramDistanceCaseInsensitive('abcdefgh', 'abcdeFG')); +select round(1000 * ngramDistanceCaseInsensitive('AAAAbcdefgh', 'defgh')); +select round(1000 * ngramDistanceCaseInsensitive('ABCdefgH', 'aaaaaaaa')); + +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'ПрИвЕт кАК ДЕЛа') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'как ПРИВЕТ дела') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'Metrika') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'mEtrica') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metriKS') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metrics') as distance; +SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'YanDEX') as 
distance; + +drop table if exists test.test_distance; diff --git a/dbms/tests/queries/0_stateless/00909_trigram_distance.reference b/dbms/tests/queries/0_stateless/00909_trigram_distance.reference deleted file mode 100644 index 14dba2a2dcf..00000000000 --- a/dbms/tests/queries/0_stateless/00909_trigram_distance.reference +++ /dev/null @@ -1,119 +0,0 @@ -0 -0 -0 -0 -0 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -1000 -0 -0 -0 -0 -0 -77 -77 -77 -77 -77 -636 -636 -636 -636 -636 -1000 -1000 -1000 -1000 -1000 -0 -1000 -1000 -0 -77 -636 -1000 -привет как дела?... Херсон -пап привет как дела - Яндекс.Видео -привет как дела клип - Яндекс.Видео -привет братан как дела - Яндекс.Видео -привет -http://metric.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -http://metris.ru/ -http://metrika.ru/ - -привет как дела?... Херсон -пап привет как дела - Яндекс.Видео -привет -привет как дела клип - Яндекс.Видео -привет братан как дела - Яндекс.Видео -http://metric.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -http://metris.ru/ -http://metrika.ru/ - -http://metrika.ru/ -http://metric.ru/ -http://metris.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metric.ru/ -http://metrica.yandex.com/ -http://autometric.ru/ -http://metris.ru/ -http://metrika.ru/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metrika.ru/ -http://metric.ru/ -http://metris.ru/ -http://autometric.ru/ -http://metrica.yandex.com/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metric.ru/ -http://autometric.ru/ -http://metris.ru/ -http://metrika.ru/ -http://metrica.yandex.com/ -привет как дела?... 
Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео - -http://metrica.yandex.com/ -привет как дела?... Херсон -привет как дела клип - Яндекс.Видео -привет -пап привет как дела - Яндекс.Видео -привет братан как дела - Яндекс.Видео -http://metric.ru/ -http://autometric.ru/ -http://metris.ru/ -http://metrika.ru/ - diff --git a/dbms/tests/queries/0_stateless/00909_trigram_distance.sql b/dbms/tests/queries/0_stateless/00909_trigram_distance.sql deleted file mode 100644 index ca6a18d2513..00000000000 --- a/dbms/tests/queries/0_stateless/00909_trigram_distance.sql +++ /dev/null @@ -1,29 +0,0 @@ -select round(1000 * trigramDistance(materialize(''), '')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абв'), '')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize(''), 'абв')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5; -select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5; - -select round(1000 * trigramDistance('', '')); -select round(1000 * trigramDistance('абв', '')); -select round(1000 * trigramDistance('', 'абв')); -select round(1000 * trigramDistance('абвгдеёжз', 'абвгдеёжз')); -select round(1000 * trigramDistance('абвгдеёжз', 'абвгдеёж')); -select round(1000 * trigramDistance('абвгдеёжз', 'гдеёзд')); -select round(1000 * trigramDistance('абвгдеёжз', 'ёёёёёёёё')); - -drop table if exists test.test_distance; -create table test.test_distance (Title String) engine = Memory; -insert into test.test_distance values ('привет как дела?... 
Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), (''); - -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'привет как дела'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'как привет дела'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrika'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrica'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metriks'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrics'); -SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'yandex'); - -drop table if exists test.test_distance; diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index b3b8b63d136..26890c4c920 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -70,5 +70,13 @@ For other regular expressions, the code is the same as for the 'match' function. The same thing as 'like', but negative. +## ngramDistance(haystack, needle) + +Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinality. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. + +For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. 
+ +Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. + [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index a79ea043716..b0f72e6474d 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -59,4 +59,12 @@ ## notLike(haystack, pattern), оператор haystack NOT LIKE pattern То же, что like, но с отрицанием. +## ngramDistance(haystack, needle) + +Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грам и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице. + +Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. + +Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммовного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв. 
+ [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) From 24cc9e4e65a3554d69b15305584393c582467827 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 01:44:47 +0300 Subject: [PATCH 02/25] Fix docs --- docs/en/query_language/functions/string_search_functions.md | 2 +- docs/ru/query_language/functions/string_search_functions.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index 26890c4c920..c900b52cf94 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -72,7 +72,7 @@ The same thing as 'like', but negative. ## ngramDistance(haystack, needle) -Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinality. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. +Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. 
diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index b0f72e6474d..48a255ded71 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -61,10 +61,10 @@ ## ngramDistance(haystack, needle) -Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грам и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице. +Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице. Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммовного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв. +Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. 
Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв. [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) From dcfd3fe37f1b025384ac0f444f189b79ba232079 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 02:08:08 +0300 Subject: [PATCH 03/25] Comment in FunctionsStringSimilarity --- dbms/src/Functions/FunctionsStringSimilarity.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp index a90c7e82acd..7c77857345a 100644 --- a/dbms/src/Functions/FunctionsStringSimilarity.cpp +++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp @@ -90,7 +90,7 @@ struct NgramDistanceImpl /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ /// Doing unaligned read of 16 bytes and copy them like above /// 16 is also chosen to do two `movups`. - /// Such copying allow us to have 3 codepoints from the previous read to produce the n-gram with them. + /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them. 
memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint)); if constexpr (CaseInsensitive) From 8800134b9a1d8ae9d5f0b87bf5ccefb2c1222455 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 02:43:23 +0300 Subject: [PATCH 04/25] remove public perf tests --- dbms/tests/performance/website/url_hits.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dbms/tests/performance/website/url_hits.xml b/dbms/tests/performance/website/url_hits.xml index 88f48705d9a..f83ec663ef7 100644 --- a/dbms/tests/performance/website/url_hits.xml +++ b/dbms/tests/performance/website/url_hits.xml @@ -78,11 +78,5 @@ SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100 SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000 SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute -SELECT count(multiSearch(URL, ['yandex', 'google', 'rambler'])) from {table}; -SELECT count(match(URL, 'google|yandex|rambler')) from hits_100m_single; -SELECT count(match(URL, 'google')), count(match(URL, 'yandex')), count(match(URL, 'rambler')) from {table} -SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 -SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 - From 
cb7158f615ebb445c0c013fc25091cb1e6615d76 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 02:52:19 +0300 Subject: [PATCH 05/25] perf test for distance functions in a proper folder --- .../string_search/ngram_distance.xml | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 dbms/tests/performance/string_search/ngram_distance.xml diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml new file mode 100644 index 00000000000..557928cbf12 --- /dev/null +++ b/dbms/tests/performance/string_search/ngram_distance.xml @@ -0,0 +1,42 @@ + + Distance search performance search + + + search + + + + hits_100m_single + + + loop + + + + 5 + 10000 + + + 50 + 60000 + + + + SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistance(Title, 'baby dont hurt me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistance(Title, 'no more') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + + SELECT DISTINCT Title, ngramDistanceUTF8CaseInsensitive(Title, 
'Метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'как дЕлА') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'Чем зАнимаешЬся') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + + + + + From d09f2023c975c2b25156e2259e1ecb183b01e05c Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 02:54:00 +0300 Subject: [PATCH 06/25] typos in perf tests for distance function --- .../performance/string_search/ngram_distance.xml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml index 557928cbf12..2c75cd967cb 100644 --- a/dbms/tests/performance/string_search/ngram_distance.xml +++ b/dbms/tests/performance/string_search/ngram_distance.xml @@ -28,13 +28,13 @@ SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT 
URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT Title, ngramDistanceUTF8CaseInsensitive(Title, 'Метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'как дЕлА') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'Чем зАнимаешЬся') AS distance FROM {table} ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT Title, ngramDistanceCaseInsensitiveUTF8(Title, 'Метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'как дЕлА') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'Чем зАнимаешЬся') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 From 97349fb83ea66fcaaf05bfe9ef1fd40227f3dafa Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 02:55:51 +0300 Subject: [PATCH 07/25] typos in perf tests for distance function --- dbms/tests/performance/string_search/ngram_distance.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml index 2c75cd967cb..84b599dd882 100644 --- a/dbms/tests/performance/string_search/ngram_distance.xml +++ b/dbms/tests/performance/string_search/ngram_distance.xml @@ -1,5 +1,5 @@ - Distance search performance search + Distance search performance test search From 570af60bfa60d993941b95a5d3335fb8b3277249 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 03:05:14 +0300 Subject: [PATCH 08/25] more typos to the god of typos in distance perf test --- dbms/tests/performance/string_search/ngram_distance.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml index 84b599dd882..16960811067 100644 --- a/dbms/tests/performance/string_search/ngram_distance.xml +++ b/dbms/tests/performance/string_search/ngram_distance.xml @@ -30,7 +30,7 @@ SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 - SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 + SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'чем занимаешься') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT Title, ngramDistanceCaseInsensitiveUTF8(Title, 'Метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'как дЕлА') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50 From dd22d1fb89e491020b7e9b5c13133e8add2f9967 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Wed, 6 Mar 2019 03:08:12 +0300 Subject: [PATCH 09/25] Better docs to the distance functions --- docs/en/query_language/functions/string_search_functions.md | 4 ++-- docs/ru/query_language/functions/string_search_functions.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index c900b52cf94..bde56693c36 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -72,11 +72,11 @@ The same thing as 'like', but negative. 
## ngramDistance(haystack, needle) -Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. +Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two multisets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. +Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. 
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 48a255ded71..6658cc4ee19 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -61,10 +61,10 @@ ## ngramDistance(haystack, needle) -Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице. +Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице. Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв. +Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. 
Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв. [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) From bccbd52d783b7bdceb364e5f8e6051995cabfa70 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 4 Mar 2019 18:40:49 +0100 Subject: [PATCH 10/25] Rework system.graphite_retentions table --- .../GraphiteRollupSortedBlockInputStream.h | 1 + .../MergeTree/registerStorageMergeTree.cpp | 1 + .../Storages/System/StorageSystemGraphite.cpp | 253 ++++++++---------- .../Storages/System/StorageSystemGraphite.h | 14 + 4 files changed, 124 insertions(+), 145 deletions(-) diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h index bb2f81fc81f..dc5260be0e7 100644 --- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h +++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h @@ -124,6 +124,7 @@ namespace Graphite struct Params { + String config_name; String path_column_name; String time_column_name; String value_column_name; diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp index 103be508564..4b934ea3122 100644 --- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -165,6 +165,7 @@ static void setGraphitePatternsFromConfig(const Context & context, throw Exception("No '" + config_element + "' element in configuration file", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + params.config_name = config_element; params.path_column_name = config.getString(config_element + 
".path_column_name", "Path"); params.time_column_name = config.getString(config_element + ".time_column_name", "Time"); params.value_column_name = config.getString(config_element + ".value_column_name", "Value"); diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp index d75eb71841e..ed37235e270 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.cpp +++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp @@ -1,175 +1,138 @@ #include +#include +#include -#include -#include -#include -#include -#include -#include -#include #include -#include - namespace DB { -namespace ErrorCodes -{ - extern const int NO_ELEMENTS_IN_CONFIG; -} - -namespace -{ - -using namespace Poco::Util; - -struct Pattern -{ - struct Retention - { - UInt64 age; - UInt64 precision; - }; - - std::string regexp; - std::string function; - std::vector retentions; - UInt16 priority; - UInt8 is_default; -}; - -static Pattern readOnePattern( - const AbstractConfiguration & config, - const std::string & path) -{ - Pattern pattern; - AbstractConfiguration::Keys keys; - - config.keys(path, keys); - - if (keys.empty()) - throw Exception("Empty pattern in Graphite rollup configuration", ErrorCodes::NO_ELEMENTS_IN_CONFIG); - - for (const auto & key : keys) - { - const String key_path = path + "." 
+ key; - - if (startsWith(key, "regexp")) - { - pattern.regexp = config.getString(key_path); - } - else if (startsWith(key, "function")) - { - pattern.function = config.getString(key_path); - } - else if (startsWith(key, "retention")) - { - pattern.retentions.push_back(Pattern::Retention{0, 0}); - pattern.retentions.back().age = config.getUInt64(key_path + ".age", 0); - pattern.retentions.back().precision = config.getUInt64(key_path + ".precision", 0); - } - } - - return pattern; -} - -static std::vector readPatterns( - const AbstractConfiguration & config, - const std::string & section) -{ - AbstractConfiguration::Keys keys; - std::vector result; - size_t count = 0; - - config.keys(section, keys); - - for (const auto & key : keys) - { - if (startsWith(key, "pattern")) - { - Pattern pattern(readOnePattern(config, section + "." + key)); - pattern.is_default = false; - pattern.priority = ++count; - result.push_back(pattern); - } - else if (startsWith(key, "default")) - { - Pattern pattern(readOnePattern(config, section + "." 
+ key)); - pattern.is_default = true; - pattern.priority = std::numeric_limits::max(); - result.push_back(pattern); - } - } - - return result; -} - -static Strings getAllGraphiteSections(const AbstractConfiguration & config) -{ - Strings result; - - AbstractConfiguration::Keys keys; - config.keys(keys); - - for (const auto & key : keys) - { - if (startsWith(key, "graphite_")) - result.push_back(key); - } - - return result; -} - -} // namespace - NamesAndTypesList StorageSystemGraphite::getNamesAndTypes() { return { - {"config_name", std::make_shared()}, - {"regexp", std::make_shared()}, - {"function", std::make_shared()}, - {"age", std::make_shared()}, - {"precision", std::make_shared()}, - {"priority", std::make_shared()}, - {"is_default", std::make_shared()}, + {"config_name", std::make_shared()}, + {"regexp", std::make_shared()}, + {"function", std::make_shared()}, + {"age", std::make_shared()}, + {"precision", std::make_shared()}, + {"priority", std::make_shared()}, + {"is_default", std::make_shared()}, + {"Tables.database", std::make_shared(std::make_shared())}, + {"Tables.table", std::make_shared(std::make_shared())}, }; } +/* + * Looking for (Replicated)*GraphiteMergeTree and get all configuration parameters for them + */ +StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & context) const +{ + const Databases databases = context.getDatabases(); + Configs graphite_configs; + + for (const auto & db : databases) + { + for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next()) + { + auto & table = iterator->table(); + const MergeTreeData * table_data = nullptr; + + if (const StorageMergeTree * merge_tree = dynamic_cast(table.get())) + { + table_data = & merge_tree->getData(); + } + else if (const StorageReplicatedMergeTree * replicated_merge_tree = dynamic_cast(table.get())) + { + table_data = & replicated_merge_tree->getData(); + } + else + { + continue; + } + + if (table_data->merging_params.mode 
== MergeTreeData::MergingParams::Graphite) + { + const String config_name = table_data->merging_params.graphite_params.config_name; + + if (graphite_configs.find(config_name) == graphite_configs.end()) + { + Config new_config = { + & table_data->merging_params.graphite_params, + { table_data->getDatabaseName() }, + { table_data->getTableName() }, + }; + graphite_configs.insert(std::make_pair(config_name, new_config)); + } + else + { + graphite_configs[config_name].databases.emplace_back(table_data->getDatabaseName()); + graphite_configs[config_name].tables.emplace_back(table_data->getTableName()); + } + } + } + } + + return graphite_configs; +} + void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { - const auto & config = context.getConfigRef(); + Configs graphite_configs = StorageSystemGraphite::getConfigs(context); - Strings sections = getAllGraphiteSections(config); - for (const auto & section : sections) + for (const auto & config : graphite_configs) { - const auto patterns = readPatterns(config, section); - for (const auto & pattern : patterns) + UInt16 priority = 0; + for (const auto & pattern : config.second.graphite_params->patterns) { + bool is_default = pattern.regexp == nullptr; + String regexp = ""; + String function = ""; + + if (is_default) + { + priority = std::numeric_limits::max(); + } + else + { + priority++; + regexp = pattern.regexp->getRE2()->pattern(); + } + + if (pattern.function) + { + function = pattern.function->getName(); + } + if (!pattern.retentions.empty()) { - for (const auto & ret : pattern.retentions) + for (const auto & retention : pattern.retentions) { - res_columns[0]->insert(section); - res_columns[1]->insert(pattern.regexp); - res_columns[2]->insert(pattern.function); - res_columns[3]->insert(ret.age); - res_columns[4]->insert(ret.precision); - res_columns[5]->insert(pattern.priority); - res_columns[6]->insert(pattern.is_default); + size_t i = 0; + 
res_columns[i++]->insert(config.first); + res_columns[i++]->insert(regexp); + res_columns[i++]->insert(function); + res_columns[i++]->insert(retention.age); + res_columns[i++]->insert(retention.precision); + res_columns[i++]->insert(priority); + res_columns[i++]->insert(is_default); + res_columns[i++]->insert(config.second.databases); + res_columns[i++]->insert(config.second.tables); } } else { - res_columns[0]->insert(section); - res_columns[1]->insert(pattern.regexp); - res_columns[2]->insert(pattern.function); - res_columns[3]->insert(0); - res_columns[4]->insert(0); - res_columns[5]->insert(pattern.priority); - res_columns[6]->insert(pattern.is_default); + size_t i = 0; + res_columns[i++]->insert(config.first); + res_columns[i++]->insert(regexp); + res_columns[i++]->insert(function); + res_columns[i++]->insert(NULL); + res_columns[i++]->insert(NULL); + res_columns[i++]->insert(priority); + res_columns[i++]->insert(is_default); + res_columns[i++]->insert(config.second.databases); + res_columns[i++]->insert(config.second.tables); } } } diff --git a/dbms/src/Storages/System/StorageSystemGraphite.h b/dbms/src/Storages/System/StorageSystemGraphite.h index fa63c839857..4205f77f1ea 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.h +++ b/dbms/src/Storages/System/StorageSystemGraphite.h @@ -1,7 +1,10 @@ #pragma once +#include #include +#include #include +#include #include namespace DB @@ -15,10 +18,21 @@ public: static NamesAndTypesList getNamesAndTypes(); + struct Config + { + const Graphite::Params * graphite_params; + Array databases; + Array tables; + }; + + using Configs = std::map; + + protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; + StorageSystemGraphite::Configs getConfigs(const Context & context) const; }; } From d1cb4932d7bf7d77a2774ab88cf869c20783346f Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 4 Mar 2019 19:22:20 +0100 Subject: [PATCH 11/25] Add documentation about system.graphite_retentions --- docs/en/operations/system_tables.md | 16 ++++++++++++++++ docs/ru/operations/system_tables.md | 17 +++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 34b44419cce..c6d90c89cb1 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -85,6 +85,22 @@ Columns: - `name`(`String`) – The name of the function. - `is_aggregate`(`UInt8`) — Whether the function is aggregate. +## system.graphite_retentions + +Contains information about the [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameters that are used in tables with [\*GraphiteMergeTree](table_engines/graphitemergetree.md) engines. + +Columns: +- `config_name` (String) - `graphite_rollup` parameter name. +- `regexp` (String) - A pattern for the metric name. +- `function` (String) - The name of the aggregating function. +- `age` (UInt64) - The minimum age of the data in seconds. +- `precision` (UInt64) - How precisely to define the age of the data in seconds. +- `priority` (UInt16) - Pattern priority. +- `is_default` (UInt8) - Whether the pattern is the default one. +- `Tables.database` (Array(String)) - Array of database names of the tables that use the `config_name` parameter. +- `Tables.table` (Array(String)) - Array of names of the tables that use the `config_name` parameter. + + ## system.merges Contains information about merges and part mutations currently in process for tables in the MergeTree family. diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 82aec59ec29..7a4e69ca1cd 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -83,6 +83,23 @@ default_expression String - выражение для значения по ум - `name` (`String`) – Имя функции.
- `is_aggregate` (`UInt8`) – Признак, является ли функция агрегатной. + +## system.graphite_retentions + +Содержит информацию о том, какие параметры [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) используются в таблицах с движками [\*GraphiteMergeTree](table_engines/graphitemergetree.md). + +Столбцы: +- `config_name` (String) - Имя параметра, используемого для `graphite_rollup`. +- `regexp` (String) - Шаблон имени метрики. +- `function` (String) - Имя агрегирующей функции. +- `age` (UInt64) - Минимальный возраст данных в секундах. +- `precision` (UInt64) - Точность определения возраста данных в секундах. +- `priority` (UInt16) - Приоритет раздела pattern. +- `is_default` (UInt8) - Является ли раздел pattern дефолтным. +- `Tables.database` (Array(String)) - Массив имён баз данных таблиц, использующих параметр `config_name`. +- `Tables.table` (Array(String)) - Массив имён таблиц, использующих параметр `config_name`. + + ## system.merges Содержит информацию о производящихся прямо сейчас слияниях и мутациях кусков для таблиц семейства MergeTree. From 90466728c6a1fbfe2a21ecc5f35aa226f9f49ddd Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 4 Mar 2019 19:50:43 +0100 Subject: [PATCH 12/25] Add tests for system.graphite_retentions --- .../test_graphite_merge_tree/test.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/dbms/tests/integration/test_graphite_merge_tree/test.py b/dbms/tests/integration/test_graphite_merge_tree/test.py index 8e98c97e077..509fbac97d0 100644 --- a/dbms/tests/integration/test_graphite_merge_tree/test.py +++ b/dbms/tests/integration/test_graphite_merge_tree/test.py @@ -231,6 +231,50 @@ SELECT * FROM test.graphite; assert TSV(result) == TSV(expected) +def test_system_graphite_retentions(graphite_table): + expected = ''' +graphite_rollup \\\\.count$ sum 0 0 1 0 ['test'] ['graphite'] +graphite_rollup \\\\.max$ max 0 0 2 0 ['test'] ['graphite'] +graphite_rollup ^five_min\\\\. 
31536000 14400 3 0 ['test'] ['graphite'] +graphite_rollup ^five_min\\\\. 5184000 3600 3 0 ['test'] ['graphite'] +graphite_rollup ^five_min\\\\. 0 300 3 0 ['test'] ['graphite'] +graphite_rollup ^one_min avg 31536000 600 4 0 ['test'] ['graphite'] +graphite_rollup ^one_min avg 7776000 300 4 0 ['test'] ['graphite'] +graphite_rollup ^one_min avg 0 60 4 0 ['test'] ['graphite'] + ''' + result = q('SELECT * from system.graphite_retentions') + + assert TSV(result) == TSV(expected) + + q(''' +DROP TABLE IF EXISTS test.graphite2; +CREATE TABLE test.graphite2 + (metric String, value Float64, timestamp UInt32, date Date, updated UInt32) + ENGINE = GraphiteMergeTree('graphite_rollup') + PARTITION BY toYYYYMM(date) + ORDER BY (metric, timestamp) + SETTINGS index_granularity=8192; + ''') + expected = ''' +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] +graphite_rollup ['test','test'] ['graphite','graphite2'] + ''' + result = q(''' + SELECT + config_name, + Tables.database, + Tables.table + FROM system.graphite_retentions + ''') + assert TSV(result) == TSV(expected) + + def test_path_dangling_pointer(graphite_table): q(''' DROP TABLE IF EXISTS test.graphite2; From 8b0d8644c860c8dbd2117114d53fa6da4471d6bf Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 7 Mar 2019 19:55:53 +0300 Subject: [PATCH 13/25] Update StorageSystemGraphite.cpp --- .../Storages/System/StorageSystemGraphite.cpp | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp index ed37235e270..4f9fb755a23 100644 --- 
a/dbms/src/Storages/System/StorageSystemGraphite.cpp +++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp @@ -41,11 +41,11 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & if (const StorageMergeTree * merge_tree = dynamic_cast(table.get())) { - table_data = & merge_tree->getData(); + table_data = &merge_tree->getData(); } else if (const StorageReplicatedMergeTree * replicated_merge_tree = dynamic_cast(table.get())) { - table_data = & replicated_merge_tree->getData(); + table_data = &replicated_merge_tree->getData(); } else { @@ -54,16 +54,18 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & if (table_data->merging_params.mode == MergeTreeData::MergingParams::Graphite) { - const String config_name = table_data->merging_params.graphite_params.config_name; + const String & config_name = table_data->merging_params.graphite_params.config_name; - if (graphite_configs.find(config_name) == graphite_configs.end()) + if (!graphite_configs.count(config_name)) { - Config new_config = { - & table_data->merging_params.graphite_params, - { table_data->getDatabaseName() }, - { table_data->getTableName() }, - }; - graphite_configs.insert(std::make_pair(config_name, new_config)); + Config new_config = + { + /// FIXME Do we own a table? 
(possible dangling reference) + &table_data->merging_params.graphite_params, + { table_data->getDatabaseName() }, + { table_data->getTableName() }, + }; + graphite_configs.emplace(config_name, new_config); } else { @@ -87,8 +89,8 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context for (const auto & pattern : config.second.graphite_params->patterns) { bool is_default = pattern.regexp == nullptr; - String regexp = ""; - String function = ""; + String regexp; + String function; if (is_default) { @@ -97,6 +99,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context else { priority++; + /// FIXME Null pointer dereference for trivial patterns. regexp = pattern.regexp->getRE2()->pattern(); } From 9e82b44b625b3150380bc11c06b88cebf1926de9 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 7 Mar 2019 21:17:06 +0100 Subject: [PATCH 14/25] Review adjustment --- .../DataStreams/GraphiteRollupSortedBlockInputStream.h | 2 ++ dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp | 1 + dbms/src/Storages/System/StorageSystemGraphite.cpp | 8 +++----- dbms/src/Storages/System/StorageSystemGraphite.h | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h index dc5260be0e7..00bd2f4b67e 100644 --- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h +++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h @@ -113,6 +113,7 @@ namespace Graphite struct Pattern { std::shared_ptr regexp; + std::string regexp_str; AggregateFunctionPtr function; Retentions retentions; /// Must be ordered by 'age' descending. 
enum { TypeUndef, TypeRetention, TypeAggregation, TypeAll } type = TypeAll; /// The type of defined pattern, filled automatically @@ -216,6 +217,7 @@ private: const Graphite::Pattern undef_pattern = { /// temporary empty pattern for selectPatternForPath nullptr, + "", nullptr, DB::Graphite::Retentions(), undef_pattern.TypeUndef, diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp index 4b934ea3122..6411ec21bac 100644 --- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -102,6 +102,7 @@ static void appendGraphitePattern( if (key == "regexp") { pattern.regexp = std::make_shared(config.getString(config_element + ".regexp")); + pattern.regexp_str = config.getString(config_element + ".regexp"); } else if (key == "function") { diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp index 4f9fb755a23..fa1b768ac98 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.cpp +++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp @@ -60,8 +60,7 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & { Config new_config = { - /// FIXME Do we own a table? 
(possible dangling reference) - &table_data->merging_params.graphite_params, + table_data->merging_params.graphite_params, { table_data->getDatabaseName() }, { table_data->getTableName() }, }; @@ -86,7 +85,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context for (const auto & config : graphite_configs) { UInt16 priority = 0; - for (const auto & pattern : config.second.graphite_params->patterns) + for (const auto & pattern : config.second.graphite_params.patterns) { bool is_default = pattern.regexp == nullptr; String regexp; @@ -99,8 +98,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context else { priority++; - /// FIXME Null pointer dereference for trivial patterns. - regexp = pattern.regexp->getRE2()->pattern(); + regexp = pattern.regexp_str; } if (pattern.function) diff --git a/dbms/src/Storages/System/StorageSystemGraphite.h b/dbms/src/Storages/System/StorageSystemGraphite.h index 4205f77f1ea..b874e294782 100644 --- a/dbms/src/Storages/System/StorageSystemGraphite.h +++ b/dbms/src/Storages/System/StorageSystemGraphite.h @@ -20,7 +20,7 @@ public: struct Config { - const Graphite::Params * graphite_params; + Graphite::Params graphite_params; Array databases; Array tables; }; From 1e71559b2dbf53a4db2ae43c46f38bedf3b05714 Mon Sep 17 00:00:00 2001 From: Simon Podlipsky Date: Sat, 9 Mar 2019 14:58:08 +0100 Subject: [PATCH 15/25] Upgrade librdkafka to RC7 --- contrib/librdkafka | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/librdkafka b/contrib/librdkafka index 363dcad5a23..51ae5f5fd8b 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit 363dcad5a23dc29381cc626620e68ae418b3af19 +Subproject commit 51ae5f5fd8b742e56f47a8bb0136344868818285 From 4e67678b642dc314aaf44eedf59c8aa817d0d20a Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 9 Mar 2019 19:57:52 +0300 Subject: [PATCH 16/25] Better docs to the distance functions --- 
docs/en/query_language/functions/string_search_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index bde56693c36..6ae7c03f73c 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -72,7 +72,7 @@ The same thing as 'like', but negative. ## ngramDistance(haystack, needle) -Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two multisets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. +Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns float number from 0 to 1 -- the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throws an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one. For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. 
From 56872ef0e3d59b87d4761926c994aff447b41b14 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 9 Mar 2019 19:59:43 +0300 Subject: [PATCH 17/25] Better docs to the distance functions --- docs/ru/query_language/functions/string_search_functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 6658cc4ee19..8939e4c926c 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -61,10 +61,10 @@ ## ngramDistance(haystack, needle) -Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице. +Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 -- чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строки из `haystack` больше 32КБ, расстояние всегда равно единице. Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. 
В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв. +Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв. [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) From b8538c49c98b7e00728f897eaf4a7347c9517a87 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 9 Mar 2019 20:01:01 +0300 Subject: [PATCH 18/25] Better docs to the distance functions --- docs/en/query_language/functions/string_search_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index 6ae7c03f73c..dce9917776c 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -76,7 +76,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. 
With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. +Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) From 86df0960d9595293fef7b784b59aa1c8f17a10b0 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 9 Mar 2019 20:07:45 +0300 Subject: [PATCH 19/25] Better docs to the distance functions --- docs/en/query_language/functions/string_search_functions.md | 2 +- docs/ru/query_language/functions/string_search_functions.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index dce9917776c..29e8bcf8a38 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -76,7 +76,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. 
With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. +Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 12-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md index 8939e4c926c..4b335cce34c 100644 --- a/docs/ru/query_language/functions/string_search_functions.md +++ b/docs/ru/query_language/functions/string_search_functions.md @@ -9,7 +9,7 @@ Для поиска без учета регистра используйте функцию `positionCaseInsensitive`. ## positionUTF8(haystack, needle) -Так же, как `position`, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение). +Так же, как `position`, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено -- то возвращает какой-нибудь результат (не кидает исключение). Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`. @@ -65,6 +65,6 @@ Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. 
-Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв. +Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами -- могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` -- мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв. [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) From 2905159c8598ab9cc90cc1b690f146cedfbea9df Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Sat, 9 Mar 2019 20:26:32 +0300 Subject: [PATCH 20/25] Better docs to the distance functions --- docs/en/query_language/functions/string_search_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md index 29e8bcf8a38..dce9917776c 100644 --- a/docs/en/query_language/functions/string_search_functions.md +++ b/docs/en/query_language/functions/string_search_functions.md @@ -76,7 +76,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`. -Notes: For UTF-8 case we use 3-gram distance. 
All these are not perfectly fair n-gram distances. We use 12-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. +Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters. [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) From 0061df234a0c8fde2a6a2839f35285c441f50b8d Mon Sep 17 00:00:00 2001 From: proller Date: Sat, 9 Mar 2019 21:52:46 +0300 Subject: [PATCH 21/25] Build fix (split) (#4641) --- dbms/programs/server/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/programs/server/CMakeLists.txt b/dbms/programs/server/CMakeLists.txt index 217447413d5..5cb08018065 100644 --- a/dbms/programs/server/CMakeLists.txt +++ b/dbms/programs/server/CMakeLists.txt @@ -10,7 +10,7 @@ set(CLICKHOUSE_SERVER_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/TCPHandler.cpp ) -set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io daemon clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY}) +set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io PUBLIC daemon PRIVATE clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY}) if (USE_POCO_NETSSL) set(CLICKHOUSE_SERVER_LINK 
${CLICKHOUSE_SERVER_LINK} PRIVATE ${Poco_NetSSL_LIBRARY} ${Poco_Crypto_LIBRARY}) endif () From 446caea46efe6d297b4a044adc266a45dbd24abd Mon Sep 17 00:00:00 2001 From: proller Date: Sun, 10 Mar 2019 04:28:13 +0300 Subject: [PATCH 22/25] Update contrib/cppkafka (#4620) * Update contrib/cppkafka * Fix --- contrib/cppkafka | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/cppkafka b/contrib/cppkafka index 860c90e92ee..9b184d881c1 160000 --- a/contrib/cppkafka +++ b/contrib/cppkafka @@ -1 +1 @@ -Subproject commit 860c90e92eee6690aa74a2ca7b7c5c6930dffecd +Subproject commit 9b184d881c15cc50784b28688c7c99d3d764db24 From 128fd20adf5ab5861e7893d9d7e655bf8c773872 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 10 Mar 2019 04:30:42 +0300 Subject: [PATCH 23/25] Update registerStorageMergeTree.cpp --- dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp index 6411ec21bac..a64f376e3de 100644 --- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -101,8 +101,8 @@ static void appendGraphitePattern( { if (key == "regexp") { - pattern.regexp = std::make_shared(config.getString(config_element + ".regexp")); pattern.regexp_str = config.getString(config_element + ".regexp"); + pattern.regexp = std::make_shared(pattern.regexp_str); } else if (key == "function") { From b81f73bb132493aca5c184a195d298a261a3366f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2019 06:13:19 +0300 Subject: [PATCH 24/25] Added a test [#CLICKHOUSE-1704] --- dbms/tests/queries/0_stateless/00915_tuple_orantius.reference | 1 + dbms/tests/queries/0_stateless/00915_tuple_orantius.sql | 1 + 2 files changed, 2 insertions(+) create mode 100644 dbms/tests/queries/0_stateless/00915_tuple_orantius.reference create 
mode 100644 dbms/tests/queries/0_stateless/00915_tuple_orantius.sql diff --git a/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference b/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference new file mode 100644 index 00000000000..6b303cbce8b --- /dev/null +++ b/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference @@ -0,0 +1 @@ +1 (1,2,3) 1 diff --git a/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql b/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql new file mode 100644 index 00000000000..938260c5123 --- /dev/null +++ b/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql @@ -0,0 +1 @@ +select 1 as x, (1,2,3) as y, x in y; From 6db73152d2e1b7b35a03b7e146549dd84fe2992d Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 10 Mar 2019 06:16:51 +0300 Subject: [PATCH 25/25] Hardening debug build (experimental) (#4632) * Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index * Addition to prev. revision * Addition to prev. revision * Addition to prev. 
revision --- dbms/src/AggregateFunctions/QuantileTDigest.h | 2 +- dbms/src/Columns/ColumnAggregateFunction.cpp | 5 ++ dbms/src/Columns/ColumnAggregateFunction.h | 2 + dbms/src/Columns/ColumnArray.cpp | 7 +++ dbms/src/Columns/ColumnArray.h | 1 + dbms/src/Columns/ColumnDecimal.h | 1 + dbms/src/Columns/ColumnFixedString.h | 5 ++ dbms/src/Columns/ColumnLowCardinality.cpp | 1 - dbms/src/Columns/ColumnNullable.cpp | 6 ++ dbms/src/Columns/ColumnNullable.h | 1 + dbms/src/Columns/ColumnString.cpp | 7 +++ dbms/src/Columns/ColumnString.h | 2 + dbms/src/Columns/ColumnTuple.cpp | 6 ++ dbms/src/Columns/ColumnTuple.h | 1 + dbms/src/Columns/ColumnUnique.h | 1 + dbms/src/Columns/ColumnVector.h | 5 ++ dbms/src/Columns/ColumnVectorHelper.h | 3 +- dbms/src/Columns/IColumn.h | 4 ++ dbms/src/Common/Allocator.cpp | 23 ++++++- dbms/src/Common/Allocator.h | 19 ++++++ dbms/src/Common/ErrorCodes.cpp | 1 + dbms/src/Common/PODArray.h | 63 +++++++++++++++++++ dbms/src/Interpreters/AggregationCommon.h | 20 +++--- .../Storages/MergeTree/MergeTreeDataPart.cpp | 5 +- .../MergeTree/MergeTreeReaderStream.cpp | 1 + libs/libcommon/include/common/mremap.h | 9 ++- 26 files changed, 182 insertions(+), 19 deletions(-) diff --git a/dbms/src/AggregateFunctions/QuantileTDigest.h b/dbms/src/AggregateFunctions/QuantileTDigest.h index ca7d4f2fb1a..c4ee76b6eed 100644 --- a/dbms/src/AggregateFunctions/QuantileTDigest.h +++ b/dbms/src/AggregateFunctions/QuantileTDigest.h @@ -85,7 +85,7 @@ class QuantileTDigest Params params; /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes. 
- static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray) - sizeof(Count) - sizeof(UInt32); + static constexpr size_t bytes_in_arena = 128 - sizeof(PODArray) - sizeof(Count) - sizeof(UInt32); using Summary = PODArray, bytes_in_arena>>; diff --git a/dbms/src/Columns/ColumnAggregateFunction.cpp b/dbms/src/Columns/ColumnAggregateFunction.cpp index 69bcdac2ab7..4652e4a08c8 100644 --- a/dbms/src/Columns/ColumnAggregateFunction.cpp +++ b/dbms/src/Columns/ColumnAggregateFunction.cpp @@ -255,6 +255,11 @@ size_t ColumnAggregateFunction::allocatedBytes() const return res; } +void ColumnAggregateFunction::protect() +{ + data.protect(); +} + MutableColumnPtr ColumnAggregateFunction::cloneEmpty() const { return create(func, Arenas(1, std::make_shared())); diff --git a/dbms/src/Columns/ColumnAggregateFunction.h b/dbms/src/Columns/ColumnAggregateFunction.h index 3fc76b4c047..a028a95d68c 100644 --- a/dbms/src/Columns/ColumnAggregateFunction.h +++ b/dbms/src/Columns/ColumnAggregateFunction.h @@ -157,6 +157,8 @@ public: size_t allocatedBytes() const override; + void protect() override; + void insertRangeFrom(const IColumn & from, size_t start, size_t length) override; void popBack(size_t n) override; diff --git a/dbms/src/Columns/ColumnArray.cpp b/dbms/src/Columns/ColumnArray.cpp index 4ceda666db7..eeb06b64f49 100644 --- a/dbms/src/Columns/ColumnArray.cpp +++ b/dbms/src/Columns/ColumnArray.cpp @@ -311,6 +311,13 @@ size_t ColumnArray::allocatedBytes() const } +void ColumnArray::protect() +{ + getData().protect(); + getOffsets().protect(); +} + + bool ColumnArray::hasEqualOffsets(const ColumnArray & other) const { if (offsets == other.offsets) diff --git a/dbms/src/Columns/ColumnArray.h b/dbms/src/Columns/ColumnArray.h index 3e1b586e755..d58dfba025a 100644 --- a/dbms/src/Columns/ColumnArray.h +++ b/dbms/src/Columns/ColumnArray.h @@ -78,6 +78,7 @@ public: void reserve(size_t n) override; size_t byteSize() const override; size_t allocatedBytes() const override; + void protect() 
override; ColumnPtr replicate(const Offsets & replicate_offsets) const override; ColumnPtr convertToFullColumnIfConst() const override; void getExtremes(Field & min, Field & max) const override; diff --git a/dbms/src/Columns/ColumnDecimal.h b/dbms/src/Columns/ColumnDecimal.h index 50a6d9d67fb..372b0c245c0 100644 --- a/dbms/src/Columns/ColumnDecimal.h +++ b/dbms/src/Columns/ColumnDecimal.h @@ -87,6 +87,7 @@ public: size_t size() const override { return data.size(); } size_t byteSize() const override { return data.size() * sizeof(data[0]); } size_t allocatedBytes() const override { return data.allocated_bytes(); } + void protect() override { data.protect(); } void reserve(size_t n) override { data.reserve(n); } void insertFrom(const IColumn & src, size_t n) override { data.push_back(static_cast(src).getData()[n]); } diff --git a/dbms/src/Columns/ColumnFixedString.h b/dbms/src/Columns/ColumnFixedString.h index 941314b8888..b773d7c8eb4 100644 --- a/dbms/src/Columns/ColumnFixedString.h +++ b/dbms/src/Columns/ColumnFixedString.h @@ -57,6 +57,11 @@ public: return chars.allocated_bytes() + sizeof(n); } + void protect() override + { + chars.protect(); + } + Field operator[](size_t index) const override { return String(reinterpret_cast(&chars[n * index]), n); diff --git a/dbms/src/Columns/ColumnLowCardinality.cpp b/dbms/src/Columns/ColumnLowCardinality.cpp index c919116112c..c9a475fd8a6 100644 --- a/dbms/src/Columns/ColumnLowCardinality.cpp +++ b/dbms/src/Columns/ColumnLowCardinality.cpp @@ -363,7 +363,6 @@ ColumnPtr ColumnLowCardinality::countKeys() const } - ColumnLowCardinality::Index::Index() : positions(ColumnUInt8::create()), size_of_type(sizeof(UInt8)) {} ColumnLowCardinality::Index::Index(MutableColumnPtr && positions) : positions(std::move(positions)) diff --git a/dbms/src/Columns/ColumnNullable.cpp b/dbms/src/Columns/ColumnNullable.cpp index b88cf60581b..d9a8ea4f825 100644 --- a/dbms/src/Columns/ColumnNullable.cpp +++ b/dbms/src/Columns/ColumnNullable.cpp @@ -291,6 
+291,12 @@ size_t ColumnNullable::allocatedBytes() const return getNestedColumn().allocatedBytes() + getNullMapColumn().allocatedBytes(); } +void ColumnNullable::protect() +{ + getNestedColumn().protect(); + getNullMapColumn().protect(); +} + namespace { diff --git a/dbms/src/Columns/ColumnNullable.h b/dbms/src/Columns/ColumnNullable.h index c8453a29689..8012d03b0e8 100644 --- a/dbms/src/Columns/ColumnNullable.h +++ b/dbms/src/Columns/ColumnNullable.h @@ -71,6 +71,7 @@ public: void reserve(size_t n) override; size_t byteSize() const override; size_t allocatedBytes() const override; + void protect() override; ColumnPtr replicate(const Offsets & replicate_offsets) const override; void updateHashWithValue(size_t n, SipHash & hash) const override; void getExtremes(Field & min, Field & max) const override; diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp index 1717c02f1df..1443283783a 100644 --- a/dbms/src/Columns/ColumnString.cpp +++ b/dbms/src/Columns/ColumnString.cpp @@ -412,4 +412,11 @@ void ColumnString::getPermutationWithCollation(const Collator & collator, bool r } } + +void ColumnString::protect() +{ + getChars().protect(); + getOffsets().protect(); +} + } diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h index 5ca05079bd5..a30a4ceb5a1 100644 --- a/dbms/src/Columns/ColumnString.h +++ b/dbms/src/Columns/ColumnString.h @@ -68,6 +68,8 @@ public: return chars.allocated_bytes() + offsets.allocated_bytes(); } + void protect() override; + MutableColumnPtr cloneResized(size_t to_size) const override; Field operator[](size_t n) const override diff --git a/dbms/src/Columns/ColumnTuple.cpp b/dbms/src/Columns/ColumnTuple.cpp index c235cd07c31..ec0bcc1f5b5 100644 --- a/dbms/src/Columns/ColumnTuple.cpp +++ b/dbms/src/Columns/ColumnTuple.cpp @@ -315,6 +315,12 @@ size_t ColumnTuple::allocatedBytes() const return res; } +void ColumnTuple::protect() +{ + for (auto & column : columns) + 
column->assumeMutableRef().protect(); +} + void ColumnTuple::getExtremes(Field & min, Field & max) const { const size_t tuple_size = columns.size(); diff --git a/dbms/src/Columns/ColumnTuple.h b/dbms/src/Columns/ColumnTuple.h index d146c8bff6c..c39a92e3c8c 100644 --- a/dbms/src/Columns/ColumnTuple.h +++ b/dbms/src/Columns/ColumnTuple.h @@ -71,6 +71,7 @@ public: void reserve(size_t n) override; size_t byteSize() const override; size_t allocatedBytes() const override; + void protect() override; void forEachSubcolumn(ColumnCallback callback) override; size_t tupleSize() const { return columns.size(); } diff --git a/dbms/src/Columns/ColumnUnique.h b/dbms/src/Columns/ColumnUnique.h index 85a9c498a94..5eee80dc9d8 100644 --- a/dbms/src/Columns/ColumnUnique.h +++ b/dbms/src/Columns/ColumnUnique.h @@ -80,6 +80,7 @@ public: bool isNumeric() const override { return column_holder->isNumeric(); } size_t byteSize() const override { return column_holder->byteSize(); } + void protect() override { column_holder->assumeMutableRef().protect(); } size_t allocatedBytes() const override { return column_holder->allocatedBytes() diff --git a/dbms/src/Columns/ColumnVector.h b/dbms/src/Columns/ColumnVector.h index 1c5a45ef6ad..9de84f95b4a 100644 --- a/dbms/src/Columns/ColumnVector.h +++ b/dbms/src/Columns/ColumnVector.h @@ -163,6 +163,11 @@ public: return data.allocated_bytes(); } + void protect() override + { + data.protect(); + } + void insertValue(const T value) { data.push_back(value); diff --git a/dbms/src/Columns/ColumnVectorHelper.h b/dbms/src/Columns/ColumnVectorHelper.h index 8a25812ffe7..d805f44218c 100644 --- a/dbms/src/Columns/ColumnVectorHelper.h +++ b/dbms/src/Columns/ColumnVectorHelper.h @@ -24,9 +24,10 @@ namespace DB class ColumnVectorHelper : public IColumn { public: + template const char * getRawDataBegin() const { - return *reinterpret_cast(reinterpret_cast(this) + sizeof(*this)); + return reinterpret_cast, 15, 16> *>(reinterpret_cast(this) + sizeof(*this))->raw_data(); 
} template diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 2560b9639ad..86a1097d368 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -253,6 +253,10 @@ public: /// Zero, if could be determined. virtual size_t allocatedBytes() const = 0; + /// Make memory region readonly with mprotect if it is large enough. + /// The operation is slow and performed only for debug builds. + virtual void protect() {} + /// If the column contains subcolumns (such as Array, Nullable, etc), do callback on them. /// Shallow: doesn't do recursive calls; don't do call for itself. using ColumnCallback = std::function; diff --git a/dbms/src/Common/Allocator.cpp b/dbms/src/Common/Allocator.cpp index ba0c7820187..92ff10eafb7 100644 --- a/dbms/src/Common/Allocator.cpp +++ b/dbms/src/Common/Allocator.cpp @@ -43,11 +43,30 @@ namespace ErrorCodes * * PS. This is also required, because tcmalloc can not allocate a chunk of memory greater than 16 GB. */ -static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20); +#ifdef NDEBUG + static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20); +#else + /// In debug build, use small mmap threshold to reproduce more memory stomping bugs. + /// Along with ASLR it will hopefully detect more issues than ASan. + /// The program may fail due to the limit on number of memory mappings. 
+ static constexpr size_t MMAP_THRESHOLD = 4096; +#endif + static constexpr size_t MMAP_MIN_ALIGNMENT = 4096; static constexpr size_t MALLOC_MIN_ALIGNMENT = 8; +template +void * Allocator::mmap_hint() +{ +#if ALLOCATOR_ASLR + return reinterpret_cast(std::uniform_int_distribution(0x100000000000UL, 0x700000000000UL)(rng)); +#else + return nullptr; +#endif +} + + template void * Allocator::alloc(size_t size, size_t alignment) { @@ -61,7 +80,7 @@ void * Allocator::alloc(size_t size, size_t alignment) throw DB::Exception("Too large alignment " + formatReadableSizeWithBinarySuffix(alignment) + ": more than page size when allocating " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::BAD_ARGUMENTS); - buf = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + buf = mmap(mmap_hint(), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (MAP_FAILED == buf) DB::throwFromErrno("Allocator: Cannot mmap " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); diff --git a/dbms/src/Common/Allocator.h b/dbms/src/Common/Allocator.h index 9a2ab0b975c..d2a81f77b62 100644 --- a/dbms/src/Common/Allocator.h +++ b/dbms/src/Common/Allocator.h @@ -2,6 +2,19 @@ #include +#ifdef NDEBUG + /// If set to 1 - randomize memory mappings manually (address space layout randomization) to reproduce more memory stomping bugs. + /// Note that Linux doesn't do it by default. This may lead to worse TLB performance. + #define ALLOCATOR_ASLR 0 +#else + #define ALLOCATOR_ASLR 1 +#endif + +#if ALLOCATOR_ASLR + #include + #include +#endif + /** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena. * Also used in hash tables. 
@@ -14,6 +27,12 @@ template class Allocator { +#if ALLOCATOR_ASLR +private: + pcg64 rng{randomSeed()}; +#endif + void * mmap_hint(); + protected: static constexpr bool clear_memory = clear_memory_; diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index d3401427037..f974b2bdaf6 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -419,6 +419,7 @@ namespace ErrorCodes extern const int BAD_DATABASE_FOR_TEMPORARY_TABLE = 442; extern const int NO_COMMON_COLUMNS_WITH_PROTOBUF_SCHEMA = 443; extern const int UNKNOWN_PROTOBUF_FORMAT = 444; + extern const int CANNOT_MPROTECT = 445; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Common/PODArray.h b/dbms/src/Common/PODArray.h index 462842f8236..a7b8b02bb98 100644 --- a/dbms/src/Common/PODArray.h +++ b/dbms/src/Common/PODArray.h @@ -17,10 +17,19 @@ #include #include +#ifndef NDEBUG + #include +#endif + namespace DB { +namespace ErrorCodes +{ + extern const int CANNOT_MPROTECT; +} + inline constexpr size_t integerRoundUp(size_t value, size_t dividend) { return ((value + dividend - 1) / dividend) * dividend; @@ -108,6 +117,8 @@ protected: if (c_start == null) return; + unprotect(); + TAllocator::free(c_start - pad_left, allocated_bytes()); } @@ -120,6 +131,8 @@ protected: return; } + unprotect(); + ptrdiff_t end_diff = c_end - c_start; c_start = reinterpret_cast( @@ -155,6 +168,28 @@ protected: realloc(allocated_bytes() * 2, std::forward(allocator_params)...); } +#ifndef NDEBUG + /// Make memory region readonly with mprotect if it is large enough. + /// The operation is slow and performed only for debug builds. 
+ void protectImpl(int prot) + { + static constexpr size_t PAGE_SIZE = 4096; + + char * left_rounded_up = reinterpret_cast((reinterpret_cast(c_start) - pad_left + PAGE_SIZE - 1) / PAGE_SIZE * PAGE_SIZE); + char * right_rounded_down = reinterpret_cast((reinterpret_cast(c_end_of_storage) + pad_right) / PAGE_SIZE * PAGE_SIZE); + + if (right_rounded_down > left_rounded_up) + { + size_t length = right_rounded_down - left_rounded_up; + if (0 != mprotect(left_rounded_up, length, prot)) + throwFromErrno("Cannot mprotect memory region", ErrorCodes::CANNOT_MPROTECT); + } + } + + /// Restore memory protection in destructor or realloc for further reuse by allocator. + bool mprotected = false; +#endif + public: bool empty() const { return c_end == c_start; } size_t size() const { return (c_end - c_start) / ELEMENT_SIZE; } @@ -199,6 +234,23 @@ public: c_end += byte_size(1); } + void protect() + { +#ifndef NDEBUG + protectImpl(PROT_READ); + mprotected = true; +#endif + } + + void unprotect() + { +#ifndef NDEBUG + if (mprotected) + protectImpl(PROT_WRITE); + mprotected = false; +#endif + } + ~PODArrayBase() { dealloc(); @@ -402,6 +454,11 @@ public: void swap(PODArray & rhs) { +#ifndef NDEBUG + this->unprotect(); + rhs.unprotect(); +#endif + /// Swap two PODArray objects, arr1 and arr2, that satisfy the following conditions: /// - The elements of arr1 are stored on stack. /// - The elements of arr2 are stored on heap. 
@@ -450,7 +507,9 @@ public: }; if (!this->isInitialized() && !rhs.isInitialized()) + { return; + } else if (!this->isInitialized() && rhs.isInitialized()) { do_move(rhs, *this); @@ -494,9 +553,13 @@ public: rhs.c_end = rhs.c_start + this->byte_size(lhs_size); } else if (this->isAllocatedFromStack() && !rhs.isAllocatedFromStack()) + { swap_stack_heap(*this, rhs); + } else if (!this->isAllocatedFromStack() && rhs.isAllocatedFromStack()) + { swap_stack_heap(rhs, *this); + } else { std::swap(this->c_start, rhs.c_start); diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h index 12c2d53819b..74836d4463d 100644 --- a/dbms/src/Interpreters/AggregationCommon.h +++ b/dbms/src/Interpreters/AggregationCommon.h @@ -102,23 +102,23 @@ static inline T ALWAYS_INLINE packFixed( switch (key_sizes[j]) { case 1: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index, 1); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<1>() + index, 1); offset += 1; break; case 2: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * 2, 2); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<2>() + index * 2, 2); offset += 2; break; case 4: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * 4, 4); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<4>() + index * 4, 4); offset += 4; break; case 8: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * 8, 8); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<8>() + index * 8, 8); offset += 8; break; default: - memcpy(bytes + offset, static_cast(column)->getRawDataBegin() + index * key_sizes[j], key_sizes[j]); + memcpy(bytes + offset, static_cast(column)->getRawDataBegin<1>() + index * key_sizes[j], key_sizes[j]); offset += key_sizes[j]; } } @@ -168,23 +168,23 @@ static inline T ALWAYS_INLINE packFixed( switch (key_sizes[j]) { case 1: - memcpy(bytes + offset, 
static_cast(key_columns[j])->getRawDataBegin() + i, 1); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<1>() + i, 1); offset += 1; break; case 2: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * 2, 2); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<2>() + i * 2, 2); offset += 2; break; case 4: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * 4, 4); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<4>() + i * 4, 4); offset += 4; break; case 8: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * 8, 8); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<8>() + i * 8, 8); offset += 8; break; default: - memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin() + i * key_sizes[j], key_sizes[j]); + memcpy(bytes + offset, static_cast(key_columns[j])->getRawDataBegin<1>() + i * key_sizes[j], key_sizes[j]); offset += key_sizes[j]; } } diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp index bf9c5b3409d..01ff4c4cdac 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp @@ -513,13 +513,16 @@ void MergeTreeDataPart::loadIndex() for (size_t i = 0; i < marks_count; ++i) //-V756 for (size_t j = 0; j < key_size; ++j) - storage.primary_key_data_types[j]->deserializeBinary(*loaded_index[j].get(), index_file); + storage.primary_key_data_types[j]->deserializeBinary(*loaded_index[j], index_file); for (size_t i = 0; i < key_size; ++i) + { + loaded_index[i]->protect(); if (loaded_index[i]->size() != marks_count) throw Exception("Cannot read all data from index file " + index_path + "(expected size: " + toString(marks_count) + ", read: " + toString(loaded_index[i]->size()) + ")", ErrorCodes::CANNOT_READ_ALL_DATA); + } if (!index_file.eof()) throw Exception("Index file " + index_path + " 
is unexpectedly long", ErrorCodes::EXPECTED_END_OF_FILE); diff --git a/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp b/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp index 9091228d80a..89f5aaeafd5 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp @@ -132,6 +132,7 @@ void MergeTreeReaderStream::loadMarks() if (buffer.eof() || buffer.buffer().size() != file_size) throw Exception("Cannot read all marks from file " + mrk_path, ErrorCodes::CANNOT_READ_ALL_DATA); + res->protect(); return res; }; diff --git a/libs/libcommon/include/common/mremap.h b/libs/libcommon/include/common/mremap.h index f569ff05d4e..31ca74da827 100644 --- a/libs/libcommon/include/common/mremap.h +++ b/libs/libcommon/include/common/mremap.h @@ -12,7 +12,8 @@ #define MREMAP_MAYMOVE 1 -void * mremap(void * old_address, +void * mremap( + void * old_address, size_t old_size, size_t new_size, int flags = 0, @@ -23,7 +24,8 @@ void * mremap(void * old_address, #endif -inline void * clickhouse_mremap(void * old_address, +inline void * clickhouse_mremap( + void * old_address, size_t old_size, size_t new_size, int flags = 0, @@ -32,7 +34,8 @@ inline void * clickhouse_mremap(void * old_address, [[maybe_unused]] int mmap_fd = -1, [[maybe_unused]] off_t mmap_offset = 0) { - return mremap(old_address, + return mremap( + old_address, old_size, new_size, flags