From 952b5ea24a09528d9f3caa4d3e033b182143a060 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 01:42:28 +0300
Subject: [PATCH 01/25] Rename trigramDistance to ngramDistance, add more
 functions with CaseInsensitive and UTF, update docs, more job done in perf,
 added some perf tests for string search that I would like to see

---
 .../Functions/FunctionsStringSimilarity.cpp   | 340 ++++++++----
 .../src/Functions/FunctionsStringSimilarity.h |   3 +-
 dbms/tests/performance/website/url_hits.xml   |   6 +
 .../00909_ngram_distance.reference            | 509 ++++++++++++++++++
 .../0_stateless/00909_ngram_distance.sql      | 106 ++++
 .../00909_trigram_distance.reference          | 119 ----
 .../0_stateless/00909_trigram_distance.sql    |  29 -
 .../functions/string_search_functions.md      |   8 +
 .../functions/string_search_functions.md      |   8 +
 9 files changed, 878 insertions(+), 250 deletions(-)
 create mode 100644 dbms/tests/queries/0_stateless/00909_ngram_distance.reference
 create mode 100644 dbms/tests/queries/0_stateless/00909_ngram_distance.sql
 delete mode 100644 dbms/tests/queries/0_stateless/00909_trigram_distance.reference
 delete mode 100644 dbms/tests/queries/0_stateless/00909_trigram_distance.sql
diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp
index 7f0267d6d59..a90c7e82acd 100644
--- a/dbms/src/Functions/FunctionsStringSimilarity.cpp
+++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp
@@ -8,164 +8,271 @@
 
 #include <Core/Defines.h>
 
+#include <common/unaligned.h>
+
 #include <algorithm>
+#include <climits>
 #include <cstring>
 #include <limits>
 #include <memory>
+#include <utility>
 
 #ifdef __SSE4_2__
-#include <nmmintrin.h>
+#    include <nmmintrin.h>
 #endif
 
 namespace DB
 {
 /** Distance function implementation.
-  * We calculate all the trigrams from left string and count by the index of
+  * We calculate all the n-grams from left string and count by the index of
   * 16 bits hash of them in the map.
-  * Then calculate all the trigrams from the right string and calculate
-  * the trigram distance on the flight by adding and subtracting from the hashmap.
+  * Then calculate all the n-grams from the right string and calculate
+  * the n-gram distance on the fly by adding and subtracting from the hashmap.
   * Then return the map into the condition of which it was after the left string
   * calculation. If the right string size is big (more than 2**15 bytes),
   * the strings are not similar at all and we return 1.
   */
-struct TrigramDistanceImpl
+template <size_t N, class CodePoint, bool UTF8, bool CaseInsensitive>
+struct NgramDistanceImpl
 {
     using ResultType = Float32;
-    using CodePoint = UInt32;
 
-    /// map_size for trigram difference
+    /// map_size for ngram difference.
     static constexpr size_t map_size = 1u << 16;
 
-    /// If the haystack size is bigger than this, behaviour is unspecified for this function
+    /// If the haystack size is bigger than this, behaviour is unspecified for this function.
     static constexpr size_t max_string_size = 1u << 15;
 
+    /// Default padding to read safely.
+    static constexpr size_t default_padding = 16;
+
+    /// Max codepoints to store at once. 16 is for batching usage and PODArray has this padding.
+    static constexpr size_t simultaneously_codepoints_num = default_padding + N - 1;
+
     /** This fits mostly in L2 cache all the time.
       * Actually use UInt16 as addings and subtractions do not UB overflow. But think of it as a signed
       * integer array.
       */
-    using TrigramStats = UInt16[map_size];
+    using NgramStats = UInt16[map_size];
 
-    static ALWAYS_INLINE UInt16 trigramHash(CodePoint one, CodePoint two, CodePoint three)
+    static ALWAYS_INLINE UInt16 ASCIIHash(const CodePoint * code_points)
     {
-        UInt64 combined = (static_cast<UInt64>(one) << 32) | two;
+        return intHashCRC32(unalignedLoad<UInt32>(code_points)) & 0xFFFFu;
+    }
+
+    static ALWAYS_INLINE UInt16 UTF8Hash(const CodePoint * code_points)
+    {
+        UInt64 combined = (static_cast<UInt64>(code_points[0]) << 32) | code_points[1];
 #ifdef __SSE4_2__
-        return _mm_crc32_u64(three, combined) & 0xFFFFu;
+        return _mm_crc32_u64(code_points[2], combined) & 0xFFFFu;
 #else
-        return (intHashCRC32(combined) ^ intHashCRC32(three)) & 0xFFFFu;
+        return (intHashCRC32(combined) ^ intHashCRC32(code_points[2])) & 0xFFFFu;
 #endif
     }
 
-    static ALWAYS_INLINE CodePoint readCodePoint(const char *& pos, const char * end) noexcept
+    template <size_t Offset, class Container, size_t... I>
+    static ALWAYS_INLINE inline void unrollLowering(Container & cont, const std::index_sequence<I...> &)
     {
-        size_t length = UTF8::seqLength(*pos);
-
-        if (pos + length > end)
-            length = end - pos;
-
-        CodePoint res;
-        /// This is faster than just memcpy because of compiler optimizations with moving bytes.
-        switch (length)
-        {
-            case 1:
-                res = 0;
-                memcpy(&res, pos, 1);
-                break;
-            case 2:
-                res = 0;
-                memcpy(&res, pos, 2);
-                break;
-            case 3:
-                res = 0;
-                memcpy(&res, pos, 3);
-                break;
-            default:
-                memcpy(&res, pos, 4);
-        }
-
-        pos += length;
-        return res;
+        ((cont[Offset + I] = std::tolower(cont[Offset + I])), ...);
     }
 
-    static inline size_t calculateNeedleStats(const char * data, const size_t size, TrigramStats & trigram_stats) noexcept
+    static ALWAYS_INLINE size_t readASCIICodePoints(CodePoint * code_points, const char *& pos, const char * end)
     {
-        size_t len = 0;
-        const char * start = data;
-        const char * end = data + size;
-        CodePoint cp1 = 0;
-        CodePoint cp2 = 0;
-        CodePoint cp3 = 0;
+        /// Offset before which we copy some data.
+        constexpr size_t padding_offset = default_padding - N + 1;
+        /// We have an array like this for ASCII (N == 4, other cases are similar)
+        /// |a0|a1|a2|a3|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
+        /// And we copy                                ^^^^^^^^^^^^^^^ these bytes to the start
+        /// Actually it is enough to copy 3 bytes, but memcpy for 4 bytes translates into 1 instruction
+        memcpy(code_points, code_points + padding_offset, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
+        /// Now we have an array
+        /// |a13|a14|a15|a16|a4|a5|a6|a7|a8|a9|a10|a11|a12|a13|a14|a15|a16|a17|a18|
+        ///              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+        /// Doing unaligned read of 16 bytes and copy them like above
+        /// 16 is also chosen to do two `movups`.
+        /// Such copying allows us to have 3 codepoints from the previous read to produce the n-gram with them.
+        memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
 
-        while (start != end)
+        if constexpr (CaseInsensitive)
         {
-            cp1 = cp2;
-            cp2 = cp3;
-            cp3 = readCodePoint(start, end);
-            ++len;
-            if (len < 3)
-                continue;
-            ++trigram_stats[trigramHash(cp1, cp2, cp3)];
+            /// We really need template lambdas with C++20 to do it inline
+            unrollLowering<N - 1>(code_points, std::make_index_sequence<padding_offset>());
         }
-        return std::max(static_cast<Int64>(0), static_cast<Int64>(len) - 2);
+        pos += padding_offset;
+        if (pos > end)
+            return default_padding - (pos - end);
+        return default_padding;
     }
 
-    static inline UInt64 calculateHaystackStatsAndMetric(const char * data, const size_t size, TrigramStats & trigram_stats, size_t & distance)
+    static ALWAYS_INLINE size_t readUTF8CodePoints(CodePoint * code_points, const char *& pos, const char * end)
     {
-        size_t len = 0;
-        size_t trigram_cnt = 0;
+        /// The same copying as described in the function above.
+        memcpy(code_points, code_points + default_padding - N + 1, roundUpToPowerOfTwoOrZero(N - 1) * sizeof(CodePoint));
+
+        size_t num = N - 1;
+        while (num < default_padding && pos < end)
+        {
+            size_t length = UTF8::seqLength(*pos);
+
+            if (pos + length > end)
+                length = end - pos;
+
+            CodePoint res;
+            /// This is faster than just memcpy because of compiler optimizations with moving bytes.
+            switch (length)
+            {
+                case 1:
+                    res = 0;
+                    memcpy(&res, pos, 1);
+                    break;
+                case 2:
+                    res = 0;
+                    memcpy(&res, pos, 2);
+                    break;
+                case 3:
+                    res = 0;
+                    memcpy(&res, pos, 3);
+                    break;
+                default:
+                    memcpy(&res, pos, 4);
+            }
+
+            /// This is not a really true case insensitive utf8. We zero the 5-th bit of every byte.
+            /// For ASCII it works https://catonmat.net/ascii-case-conversion-trick. For most Cyrillic letters it also does.
+            /// For others, we don't care now. Lowering UTF is not a cheap operation.
+            if constexpr (CaseInsensitive)
+            {
+                switch (length)
+                {
+                    case 4:
+                        res &= ~(1u << (5 + 3 * CHAR_BIT));
+                        [[fallthrough]];
+                    case 3:
+                        res &= ~(1u << (5 + 2 * CHAR_BIT));
+                        [[fallthrough]];
+                    case 2:
+                        res &= ~(1u << (5 + CHAR_BIT));
+                        [[fallthrough]];
+                    default:
+                        res &= ~(1u << 5);
+                }
+            }
+
+            pos += length;
+            code_points[num++] = res;
+        }
+        return num;
+    }
+
+    /// Counts the needle's n-grams into ngram_stats (one increment per hashed n-gram) and returns how many were counted.
+    static ALWAYS_INLINE inline size_t calculateNeedleStats(
+        const char * data,
+        const size_t size,
+        NgramStats & ngram_stats,
+        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
+        UInt16 (*hash_functor)(const CodePoint *))
+    {
+        /// Fewer than N bytes cannot form a single n-gram.
+        if (size < N)
+            return 0;
+
         const char * start = data;
         const char * end = data + size;
-        CodePoint cp1 = 0;
-        CodePoint cp2 = 0;
-        CodePoint cp3 = 0;
+        CodePoint cp[simultaneously_codepoints_num] = {};
+
+        /// read_code_points returns the position of cp where it stopped reading codepoints.
+        size_t found = read_code_points(cp, start, end);
+        /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
+        size_t i = N - 1;
+        /// Count n-grams as they are hashed instead of deriving the count from `found`: a UTF-8 needle of
+        /// at least N bytes can hold fewer than N - 1 code points, and a count derived from `found` would wrap.
+        size_t len = 0;
+        do
+        {
+            for (; i + N <= found; ++i, ++len)
+                ++ngram_stats[hash_functor(cp + i)];
+            i = 0;
+        } while (start < end && (found = read_code_points(cp, start, end)));
+        return len;
+    }
+
+    static ALWAYS_INLINE inline UInt64 calculateHaystackStatsAndMetric(
+        const char * data,
+        const size_t size,
+        NgramStats & ngram_stats,
+        size_t & distance,
+        size_t (*read_code_points)(CodePoint *, const char *&, const char *),
+        UInt16 (*hash_functor)(const CodePoint *))
+    {
+        size_t ngram_cnt = 0;
+        const char * start = data;
+        const char * end = data + size;
+        CodePoint cp[simultaneously_codepoints_num] = {};
 
         /// allocation tricks, most strings are relatively small
         static constexpr size_t small_buffer_size = 256;
         std::unique_ptr<UInt16[]> big_buffer;
         UInt16 small_buffer[small_buffer_size];
-        UInt16 * trigram_storage = small_buffer;
+        UInt16 * ngram_storage = small_buffer;
 
         if (size > small_buffer_size)
         {
-            trigram_storage = new UInt16[size];
-            big_buffer.reset(trigram_storage);
+            ngram_storage = new UInt16[size];
+            big_buffer.reset(ngram_storage);
         }
 
-        while (start != end)
+        /// read_code_points returns the position of cp where it stopped reading codepoints.
+        size_t found = read_code_points(cp, start, end);
+        /// We need to start for the first time here, because first N - 1 codepoints mean nothing.
+        size_t iter = N - 1;
+
+        do
         {
-            cp1 = cp2;
-            cp2 = cp3;
-            cp3 = readCodePoint(start, end);
-            ++len;
-            if (len < 3)
-                continue;
+            for (; iter + N <= found; ++iter)
+            {
+                UInt16 hash = hash_functor(cp + iter);
+                if (static_cast<Int16>(ngram_stats[hash]) > 0)
+                    --distance;
+                else
+                    ++distance;
 
-            UInt16 hash = trigramHash(cp1, cp2, cp3);
-
-            if (static_cast<Int16>(trigram_stats[hash]) > 0)
-                --distance;
-            else
-                ++distance;
-
-            trigram_storage[trigram_cnt++] = hash;
-            --trigram_stats[hash];
-        }
+                ngram_storage[ngram_cnt++] = hash;
+                --ngram_stats[hash];
+            }
+            iter = 0;
+        } while (start < end && (found = read_code_points(cp, start, end)));
 
         /// Return the state of hash map to its initial.
-        for (size_t i = 0; i < trigram_cnt; ++i)
-            ++trigram_stats[trigram_storage[i]];
-
-        return trigram_cnt;
+        for (size_t i = 0; i < ngram_cnt; ++i)
+            ++ngram_stats[ngram_storage[i]];
+        return ngram_cnt;
     }
 
-    static void constant_constant(const std::string & data, const std::string & needle, Float32 & res)
+    template <class Callback, class... Args>
+    static inline size_t dispatchSearcher(Callback callback, Args &&... args)
     {
-        TrigramStats common_stats;
+        if constexpr (!UTF8)
+            return callback(std::forward<Args>(args)..., readASCIICodePoints, ASCIIHash);
+        else
+            return callback(std::forward<Args>(args)..., readUTF8CodePoints, UTF8Hash);
+    }
+
+    static void constant_constant(std::string data, std::string needle, Float32 & res)
+    {
+        NgramStats common_stats;
         memset(common_stats, 0, sizeof(common_stats));
-        size_t second_size = calculateNeedleStats(needle.data(), needle.size(), common_stats);
+
+        /// We use unsafe versions of getting ngrams, so I decided to use padded strings.
+        const size_t needle_size = needle.size();
+        const size_t data_size = data.size();
+        needle.resize(needle_size + default_padding);
+        data.resize(data_size + default_padding);
+
+        size_t second_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats);
         size_t distance = second_size;
-        if (data.size() <= max_string_size)
+        if (data_size <= max_string_size)
         {
-            size_t first_size = calculateHaystackStatsAndMetric(data.data(), data.size(), common_stats, distance);
+            size_t first_size = dispatchSearcher(calculateHaystackStatsAndMetric, data.data(), data_size, common_stats, distance);
             res = distance * 1.f / std::max(first_size + second_size, size_t(1));
         }
         else
@@ -175,11 +282,18 @@ struct TrigramDistanceImpl
     }
 
     static void vector_constant(
-        const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<Float32> & res)
+        const ColumnString::Chars & data, const ColumnString::Offsets & offsets, std::string needle, PaddedPODArray<Float32> & res)
     {
-        TrigramStats common_stats;
+        /// zeroing our map
+        NgramStats common_stats;
         memset(common_stats, 0, sizeof(common_stats));
-        const size_t needle_stats_size = calculateNeedleStats(needle.data(), needle.size(), common_stats);
+
+        /// We use unsafe versions of getting ngrams, so I decided to use padded_data even in needle case.
+        const size_t needle_size = needle.size();
+        needle.resize(needle_size + default_padding);
+
+        const size_t needle_stats_size = dispatchSearcher(calculateNeedleStats, needle.data(), needle_size, common_stats);
+
         size_t distance = needle_stats_size;
         size_t prev_offset = 0;
         for (size_t i = 0; i < offsets.size(); ++i)
@@ -188,12 +302,13 @@ struct TrigramDistanceImpl
             const size_t haystack_size = offsets[i] - prev_offset - 1;
             if (haystack_size <= max_string_size)
             {
-                size_t haystack_stats_size
-                    = calculateHaystackStatsAndMetric(reinterpret_cast<const char *>(haystack), haystack_size, common_stats, distance);
+                size_t haystack_stats_size = dispatchSearcher(
+                    calculateHaystackStatsAndMetric, reinterpret_cast<const char *>(haystack), haystack_size, common_stats, distance);
                 res[i] = distance * 1.f / std::max(haystack_stats_size + needle_stats_size, size_t(1));
             }
             else
             {
+                /// if the strings are too big, we say they are completely not the same
                 res[i] = 1.f;
             }
             distance = needle_stats_size;
@@ -203,16 +318,39 @@ struct TrigramDistanceImpl
 };
 
 
-struct TrigramDistanceName
+struct NgramDistanceName
 {
-    static constexpr auto name = "trigramDistance";
+    static constexpr auto name = "ngramDistance";
 };
 
-using FunctionTrigramsDistance = FunctionsStringSimilarity<TrigramDistanceImpl, TrigramDistanceName>;
+struct NgramDistanceCaseInsensitiveName
+{
+    static constexpr auto name = "ngramDistanceCaseInsensitive";
+};
+
+struct NgramDistanceUTF8Name
+{
+    static constexpr auto name = "ngramDistanceUTF8";
+};
+
+struct NgramDistanceUTF8CaseInsensitiveName
+{
+    static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8";
+};
+
+using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false>, NgramDistanceName>;
+using FunctionNgramDistanceCaseInsensitive
+    = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true>, NgramDistanceCaseInsensitiveName>;
+using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false>, NgramDistanceUTF8Name>;
+using FunctionNgramDistanceCaseInsensitiveUTF8
+    = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true>, NgramDistanceUTF8CaseInsensitiveName>;
 
 void registerFunctionsStringSimilarity(FunctionFactory & factory)
 {
-    factory.registerFunction<FunctionTrigramsDistance>();
+    factory.registerFunction<FunctionNgramDistance>();
+    factory.registerFunction<FunctionNgramDistanceCaseInsensitive>();
+    factory.registerFunction<FunctionNgramDistanceUTF8>();
+    factory.registerFunction<FunctionNgramDistanceCaseInsensitiveUTF8>();
 }
 
 }
diff --git a/dbms/src/Functions/FunctionsStringSimilarity.h b/dbms/src/Functions/FunctionsStringSimilarity.h
index 00c90e20569..c23d9be999a 100644
--- a/dbms/src/Functions/FunctionsStringSimilarity.h
+++ b/dbms/src/Functions/FunctionsStringSimilarity.h
@@ -12,8 +12,9 @@ namespace DB
 
 /** Calculate similarity metrics:
   *
-  * trigramDistance(haystack, needle) --- calculate so called 3-gram distance between haystack and needle.
+  * ngramDistance(haystack, needle) --- calculate n-gram distance between haystack and needle.
   * Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other.
+  * Also supports CaseInsensitive and UTF8 variants.
   */
 
 namespace ErrorCodes
diff --git a/dbms/tests/performance/website/url_hits.xml b/dbms/tests/performance/website/url_hits.xml
index f83ec663ef7..88f48705d9a 100644
--- a/dbms/tests/performance/website/url_hits.xml
+++ b/dbms/tests/performance/website/url_hits.xml
@@ -78,5 +78,11 @@
 <query>SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100</query>
 <query>SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000</query>
 <query>SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-02' AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute</query>
+<query>SELECT count(multiSearch(URL, ['yandex', 'google', 'rambler'])) from {table};</query>
+<query>SELECT count(match(URL, 'google|yandex|rambler')) from {table};</query>
+<query>SELECT count(match(URL, 'google')), count(match(URL, 'yandex')), count(match(URL, 'rambler')) from {table}</query>
+<query>SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+<query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+
 
 </test>
diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.reference b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference
new file mode 100644
index 00000000000..356cc5db466
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.reference
@@ -0,0 +1,509 @@
+0
+0
+0
+0
+0
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+0
+0
+0
+0
+0
+77
+77
+77
+77
+77
+636
+636
+636
+636
+636
+1000
+1000
+1000
+1000
+1000
+0
+1000
+1000
+0
+77
+636
+1000
+привет как дела?... Херсон	297
+пап привет как дела - Яндекс.Видео	422
+привет как дела клип - Яндекс.Видео	435
+привет братан как дела - Яндекс.Видео	500
+привет	529
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+привет как дела?... Херсон	459
+пап привет как дела - Яндекс.Видео	511
+привет	529
+привет как дела клип - Яндекс.Видео	565
+привет братан как дела - Яндекс.Видео	583
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+http://metrika.ru/	524
+http://metric.ru/	700
+http://metris.ru/	700
+http://autometric.ru/	750
+http://metrica.yandex.com/	793
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	600
+http://metrica.yandex.com/	655
+http://autometric.ru/	667
+http://metris.ru/	700
+http://metrika.ru/	714
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrika.ru/	619
+http://metric.ru/	700
+http://metris.ru/	700
+http://autometric.ru/	750
+http://metrica.yandex.com/	793
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	600
+http://autometric.ru/	667
+http://metris.ru/	700
+http://metrika.ru/	714
+http://metrica.yandex.com/	724
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrica.yandex.com/	714
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+0
+0
+0
+0
+0
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+1000
+0
+0
+0
+0
+0
+77
+77
+77
+77
+77
+636
+636
+636
+636
+636
+1000
+1000
+1000
+1000
+1000
+0
+1000
+1000
+429
+77
+636
+1000
+привет как дела?... Херсон	297
+пап привет как дела - Яндекс.Видео	422
+привет как дела клип - Яндекс.Видео	435
+привет братан как дела - Яндекс.Видео	500
+привет	529
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+привет как дела?... Херсон	676
+пап привет как дела - Яндекс.Видео	733
+привет как дела клип - Яндекс.Видео	739
+привет братан как дела - Яндекс.Видео	750
+привет	882
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+http://metrika.ru/	524
+http://metric.ru/	700
+http://metris.ru/	700
+http://autometric.ru/	750
+http://metrica.yandex.com/	793
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrika.ru/	524
+http://metric.ru/	700
+http://metris.ru/	700
+http://autometric.ru/	750
+http://metrica.yandex.com/	793
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	600
+http://metrica.yandex.com/	655
+http://autometric.ru/	667
+http://metris.ru/	700
+http://metrika.ru/	714
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrika.ru/	619
+http://metric.ru/	700
+http://metris.ru/	700
+http://autometric.ru/	750
+http://metrica.yandex.com/	793
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	600
+http://autometric.ru/	667
+http://metris.ru/	700
+http://metrika.ru/	714
+http://metrica.yandex.com/	724
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrica.yandex.com/	714
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+привет как дела клип - Яндекс.Видео	182
+пап привет как дела - Яндекс.Видео	354
+привет братан как дела - Яндекс.Видео	382
+привет как дела?... Херсон	649
+привет	838
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+111
+111
+111
+111
+111
+429
+429
+429
+429
+429
+1000
+1000
+1000
+1000
+1000
+0
+0
+0
+0
+111
+429
+1000
+привет как дела?... Херсон	254
+пап привет как дела - Яндекс.Видео	398
+привет как дела клип - Яндекс.Видео	412
+привет братан как дела - Яндекс.Видео	461
+привет	471
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+привет как дела?... Херсон	343
+пап привет как дела - Яндекс.Видео	446
+привет	471
+привет как дела клип - Яндекс.Видео	482
+привет братан как дела - Яндекс.Видео	506
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+http://metrika.ru/	579
+http://metric.ru/	778
+http://metris.ru/	778
+http://autometric.ru/	818
+http://metrica.yandex.com/	852
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	667
+http://metrica.yandex.com/	704
+http://autometric.ru/	727
+http://metris.ru/	778
+http://metrika.ru/	789
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrika.ru/	684
+http://metric.ru/	778
+http://metris.ru/	778
+http://autometric.ru/	818
+http://metrica.yandex.com/	852
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	667
+http://autometric.ru/	727
+http://metrica.yandex.com/	778
+http://metris.ru/	778
+http://metrika.ru/	789
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrica.yandex.com/	769
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
+111
+111
+111
+111
+111
+600
+600
+600
+600
+600
+1000
+1000
+1000
+1000
+1000
+0
+0
+0
+0
+111
+600
+1000
+привет как дела?... Херсон	910
+пап привет как дела - Яндекс.Видео	928
+привет как дела клип - Яндекс.Видео	929
+привет братан как дела - Яндекс.Видео	955
+привет	1000
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+привет как дела?... Херсон	672
+пап привет как дела - Яндекс.Видео	735
+привет как дела клип - Яндекс.Видео	741
+привет братан как дела - Яндекс.Видео	753
+привет	1000
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metrica.yandex.com/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
+http://metrika.ru/	579
+http://metric.ru/	778
+http://metris.ru/	778
+http://autometric.ru/	818
+http://metrica.yandex.com/	852
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrika.ru/	579
+http://metric.ru/	778
+http://metris.ru/	778
+http://autometric.ru/	818
+http://metrica.yandex.com/	852
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	667
+http://metrica.yandex.com/	704
+http://autometric.ru/	727
+http://metris.ru/	778
+http://metrika.ru/	789
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrika.ru/	684
+http://metric.ru/	778
+http://metris.ru/	778
+http://autometric.ru/	818
+http://metrica.yandex.com/	852
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metric.ru/	667
+http://autometric.ru/	727
+http://metrica.yandex.com/	778
+http://metris.ru/	778
+http://metrika.ru/	789
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+	1000
+http://metrica.yandex.com/	769
+привет как дела?... Херсон	1000
+привет как дела клип - Яндекс.Видео	1000
+привет	1000
+пап привет как дела - Яндекс.Видео	1000
+привет братан как дела - Яндекс.Видео	1000
+http://metric.ru/	1000
+http://autometric.ru/	1000
+http://metris.ru/	1000
+http://metrika.ru/	1000
+	1000
diff --git a/dbms/tests/queries/0_stateless/00909_ngram_distance.sql b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql
new file mode 100644
index 00000000000..867e69f4fe7
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00909_ngram_distance.sql
@@ -0,0 +1,106 @@
+select round(1000 * ngramDistanceUTF8(materialize(''), '')) from system.numbers limit 5;
+select round(1000 * ngramDistanceUTF8(materialize('абв'), '')) from system.numbers limit 5;
+select round(1000 * ngramDistanceUTF8(materialize(''), 'абв')) from system.numbers limit 5;
+select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5;
+select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5;
+select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5;
+select round(1000 * ngramDistanceUTF8(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5;
+
+select round(1000 * ngramDistanceUTF8('', ''));
+select round(1000 * ngramDistanceUTF8('абв', ''));
+select round(1000 * ngramDistanceUTF8('', 'абв'));
+select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'абвгдеёжз'));
+select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'абвгдеёж'));
+select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'гдеёзд'));
+select round(1000 * ngramDistanceUTF8('абвгдеёжз', 'ёёёёёёёё'));
+
+drop table if exists test.test_distance;
+create table test.test_distance (Title String) engine = Memory;
+insert into test.test_distance values ('привет как дела?... Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), ('');
+
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'привет как дела') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'как привет дела') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrika') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrica') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metriks') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'metrics') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceUTF8(Title, 'yandex') as distance;
+
+
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), '')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абв'), '')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize(''), 'абв')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвГДЕёжз'), 'АбвгдЕёжз')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('аБВГдеёЖз'), 'АбвГдеёж')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'гдеёЗД')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitiveUTF8(materialize('абвгдеёжз'), 'ЁЁЁЁЁЁЁЁ')) from system.numbers limit 5;
+
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('', ''));
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('абв', ''));
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('', 'абв'));
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвГДЕёжз', 'АбвгдЕЁжз'));
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('аБВГдеёЖз', 'АбвГдеёж'));
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('абвгдеёжз', 'гдеёЗД'));
+select round(1000 * ngramDistanceCaseInsensitiveUTF8('АБВГДеёжз', 'ЁЁЁЁЁЁЁЁ'));
+
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'ПрИвЕт кАК ДЕЛа') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'как ПРИВЕТ дела') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrika') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'Metrika') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'mEtrica') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metriKS') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'metrics') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'YanDEX') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitiveUTF8(Title, 'приВЕТ КАк ДеЛа КлИп - яндеКс.видео') as distance;
+
+
+select round(1000 * ngramDistance(materialize(''), '')) from system.numbers limit 5;
+select round(1000 * ngramDistance(materialize('abc'), '')) from system.numbers limit 5;
+select round(1000 * ngramDistance(materialize(''), 'abc')) from system.numbers limit 5;
+select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefgh')) from system.numbers limit 5;
+select round(1000 * ngramDistance(materialize('abcdefgh'), 'abcdefg')) from system.numbers limit 5;
+select round(1000 * ngramDistance(materialize('abcdefgh'), 'defgh')) from system.numbers limit 5;
+select round(1000 * ngramDistance(materialize('abcdefgh'), 'aaaaaaaa')) from system.numbers limit 5;
+
+select round(1000 * ngramDistance('', ''));
+select round(1000 * ngramDistance('abc', ''));
+select round(1000 * ngramDistance('', 'abc'));
+select round(1000 * ngramDistance('abcdefgh', 'abcdefgh'));
+select round(1000 * ngramDistance('abcdefgh', 'abcdefg'));
+select round(1000 * ngramDistance('abcdefgh', 'defgh'));
+select round(1000 * ngramDistance('abcdefgh', 'aaaaaaaa'));
+
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'привет как дела') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'как привет дела') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrika') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrica') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metriks') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'metrics') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistance(Title, 'yandex') as distance;
+
+select round(1000 * ngramDistanceCaseInsensitive(materialize(''), '')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitive(materialize('abc'), '')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitive(materialize(''), 'abc')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitive(materialize('abCdefgH'), 'Abcdefgh')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitive(materialize('abcdefgh'), 'abcdeFG')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitive(materialize('AAAAbcdefgh'), 'defgh')) from system.numbers limit 5;
+select round(1000 * ngramDistanceCaseInsensitive(materialize('ABCdefgH'), 'aaaaaaaa')) from system.numbers limit 5;
+
+select round(1000 * ngramDistanceCaseInsensitive('', ''));
+select round(1000 * ngramDistanceCaseInsensitive('abc', ''));
+select round(1000 * ngramDistanceCaseInsensitive('', 'abc'));
+select round(1000 * ngramDistanceCaseInsensitive('abCdefgH', 'Abcdefgh'));
+select round(1000 * ngramDistanceCaseInsensitive('abcdefgh', 'abcdeFG'));
+select round(1000 * ngramDistanceCaseInsensitive('AAAAbcdefgh', 'defgh'));
+select round(1000 * ngramDistanceCaseInsensitive('ABCdefgH', 'aaaaaaaa'));
+
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'ПрИвЕт кАК ДЕЛа') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'как ПРИВЕТ дела') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metrika') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'Metrika') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'mEtrica') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metriKS') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'metrics') as distance;
+SELECT Title, round(1000 * distance) FROM test.test_distance ORDER BY ngramDistanceCaseInsensitive(Title, 'YanDEX') as distance;
+
+drop table if exists test.test_distance;
diff --git a/dbms/tests/queries/0_stateless/00909_trigram_distance.reference b/dbms/tests/queries/0_stateless/00909_trigram_distance.reference
deleted file mode 100644
index 14dba2a2dcf..00000000000
--- a/dbms/tests/queries/0_stateless/00909_trigram_distance.reference
+++ /dev/null
@@ -1,119 +0,0 @@
-0
-0
-0
-0
-0
-1000
-1000
-1000
-1000
-1000
-1000
-1000
-1000
-1000
-1000
-0
-0
-0
-0
-0
-77
-77
-77
-77
-77
-636
-636
-636
-636
-636
-1000
-1000
-1000
-1000
-1000
-0
-1000
-1000
-0
-77
-636
-1000
-привет как дела?... Херсон
-пап привет как дела - Яндекс.Видео
-привет как дела клип - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-привет
-http://metric.ru/
-http://autometric.ru/
-http://metrica.yandex.com/
-http://metris.ru/
-http://metrika.ru/
-
-привет как дела?... Херсон
-пап привет как дела - Яндекс.Видео
-привет
-привет как дела клип - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-http://metric.ru/
-http://autometric.ru/
-http://metrica.yandex.com/
-http://metris.ru/
-http://metrika.ru/
-
-http://metrika.ru/
-http://metric.ru/
-http://metris.ru/
-http://autometric.ru/
-http://metrica.yandex.com/
-привет как дела?... Херсон
-привет как дела клип - Яндекс.Видео
-привет
-пап привет как дела - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-
-http://metric.ru/
-http://metrica.yandex.com/
-http://autometric.ru/
-http://metris.ru/
-http://metrika.ru/
-привет как дела?... Херсон
-привет как дела клип - Яндекс.Видео
-привет
-пап привет как дела - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-
-http://metrika.ru/
-http://metric.ru/
-http://metris.ru/
-http://autometric.ru/
-http://metrica.yandex.com/
-привет как дела?... Херсон
-привет как дела клип - Яндекс.Видео
-привет
-пап привет как дела - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-
-http://metric.ru/
-http://autometric.ru/
-http://metris.ru/
-http://metrika.ru/
-http://metrica.yandex.com/
-привет как дела?... Херсон
-привет как дела клип - Яндекс.Видео
-привет
-пап привет как дела - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-
-http://metrica.yandex.com/
-привет как дела?... Херсон
-привет как дела клип - Яндекс.Видео
-привет
-пап привет как дела - Яндекс.Видео
-привет братан как дела - Яндекс.Видео
-http://metric.ru/
-http://autometric.ru/
-http://metris.ru/
-http://metrika.ru/
-
diff --git a/dbms/tests/queries/0_stateless/00909_trigram_distance.sql b/dbms/tests/queries/0_stateless/00909_trigram_distance.sql
deleted file mode 100644
index ca6a18d2513..00000000000
--- a/dbms/tests/queries/0_stateless/00909_trigram_distance.sql
+++ /dev/null
@@ -1,29 +0,0 @@
-select round(1000 * trigramDistance(materialize(''), '')) from system.numbers limit 5;
-select round(1000 * trigramDistance(materialize('абв'), '')) from system.numbers limit 5;
-select round(1000 * trigramDistance(materialize(''), 'абв')) from system.numbers limit 5;
-select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'абвгдеёжз')) from system.numbers limit 5;
-select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'абвгдеёж')) from system.numbers limit 5;
-select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'гдеёзд')) from system.numbers limit 5;
-select round(1000 * trigramDistance(materialize('абвгдеёжз'), 'ёёёёёёёё')) from system.numbers limit 5;
-
-select round(1000 * trigramDistance('', ''));
-select round(1000 * trigramDistance('абв', ''));
-select round(1000 * trigramDistance('', 'абв'));
-select round(1000 * trigramDistance('абвгдеёжз', 'абвгдеёжз'));
-select round(1000 * trigramDistance('абвгдеёжз', 'абвгдеёж'));
-select round(1000 * trigramDistance('абвгдеёжз', 'гдеёзд'));
-select round(1000 * trigramDistance('абвгдеёжз', 'ёёёёёёёё'));
-
-drop table if exists test.test_distance;
-create table test.test_distance (Title String) engine = Memory;
-insert into test.test_distance values ('привет как дела?... Херсон'), ('привет как дела клип - Яндекс.Видео'), ('привет'), ('пап привет как дела - Яндекс.Видео'), ('привет братан как дела - Яндекс.Видео'), ('http://metric.ru/'), ('http://autometric.ru/'), ('http://metrica.yandex.com/'), ('http://metris.ru/'), ('http://metrika.ru/'), ('');
-
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'привет как дела');
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'как привет дела');
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrika');
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrica');
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metriks');
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'metrics');
-SELECT Title FROM test.test_distance ORDER BY trigramDistance(Title, 'yandex');
-
-drop table if exists test.test_distance;
diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index b3b8b63d136..26890c4c920 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -70,5 +70,13 @@ For other regular expressions, the code is the same as for the 'match' function.
 
 The same thing as 'like', but negative.
 
+## ngramDistance(haystack, needle)
+
+Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinality. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
+
+For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
+
+Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
+
 
 [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) <!--hide-->
diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md
index a79ea043716..b0f72e6474d 100644
--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -59,4 +59,12 @@
 ## notLike(haystack, pattern), оператор haystack NOT LIKE pattern
 То же, что like, но с отрицанием.
 
+## ngramDistance(haystack, needle)
+
+Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грам и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице.
+
+Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
+
+Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммовного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв.
+
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) <!--hide-->

From 24cc9e4e65a3554d69b15305584393c582467827 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 01:44:47 +0300
Subject: [PATCH 02/25] Fix docs

---
 docs/en/query_language/functions/string_search_functions.md | 2 +-
 docs/ru/query_language/functions/string_search_functions.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index 26890c4c920..c900b52cf94 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -72,7 +72,7 @@ The same thing as 'like', but negative.
 
 ## ngramDistance(haystack, needle)
 
-Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinality. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
+Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
 
 For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md
index b0f72e6474d..48a255ded71 100644
--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -61,10 +61,10 @@
 
 ## ngramDistance(haystack, needle)
 
-Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грам и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице.
+Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице.
 
 Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммовного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв.
+Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв.
 
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) <!--hide-->

From dcfd3fe37f1b025384ac0f444f189b79ba232079 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 02:08:08 +0300
Subject: [PATCH 03/25] Comment in FunctionsStringSimilarity

---
 dbms/src/Functions/FunctionsStringSimilarity.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/Functions/FunctionsStringSimilarity.cpp b/dbms/src/Functions/FunctionsStringSimilarity.cpp
index a90c7e82acd..7c77857345a 100644
--- a/dbms/src/Functions/FunctionsStringSimilarity.cpp
+++ b/dbms/src/Functions/FunctionsStringSimilarity.cpp
@@ -90,7 +90,7 @@ struct NgramDistanceImpl
         ///              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
         /// Doing unaligned read of 16 bytes and copy them like above
         /// 16 is also chosen to do two `movups`.
-        /// Such copying allow us to have 3 codepoints from the previous read to produce the n-gram with them.
+        /// Such copying allow us to have 3 codepoints from the previous read to produce the 4-grams with them.
         memcpy(code_points + (N - 1), pos, default_padding * sizeof(CodePoint));
 
         if constexpr (CaseInsensitive)

From 8800134b9a1d8ae9d5f0b87bf5ccefb2c1222455 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 02:43:23 +0300
Subject: [PATCH 04/25] remove public perf tests

---
 dbms/tests/performance/website/url_hits.xml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/dbms/tests/performance/website/url_hits.xml b/dbms/tests/performance/website/url_hits.xml
index 88f48705d9a..f83ec663ef7 100644
--- a/dbms/tests/performance/website/url_hits.xml
+++ b/dbms/tests/performance/website/url_hits.xml
@@ -78,11 +78,5 @@
 <query>SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100</query>
 <query>SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-31' AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000</query>
 <query>SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 34 AND EventDate >= '2013-07-01' AND EventDate &lt;= '2013-07-02' AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute</query>
-<query>SELECT count(multiSearch(URL, ['yandex', 'google', 'rambler'])) from {table};</query>
-<query>SELECT count(match(URL, 'google|yandex|rambler')) from hits_100m_single;</query>
-<query>SELECT count(match(URL, 'google')), count(match(URL, 'yandex')), count(match(URL, 'rambler')) from {table}</query>
-<query>SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
-<query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
-
 
 </test>

From cb7158f615ebb445c0c013fc25091cb1e6615d76 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 02:52:19 +0300
Subject: [PATCH 05/25] perf test for distance functions in a proper folder

---
 .../string_search/ngram_distance.xml          | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 dbms/tests/performance/string_search/ngram_distance.xml

diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml
new file mode 100644
index 00000000000..557928cbf12
--- /dev/null
+++ b/dbms/tests/performance/string_search/ngram_distance.xml
@@ -0,0 +1,42 @@
+<test>
+    <name>Distance search performance search</name>
+
+    <tags>
+        <tag>search</tag>
+    </tags>
+
+    <preconditions>
+        <table_exists>hits_100m_single</table_exists>
+    </preconditions>
+
+    <type>loop</type>
+
+    <stop_conditions>
+        <all_of>
+            <iterations>5</iterations>
+            <min_time_not_changing_for_ms>10000</min_time_not_changing_for_ms>
+        </all_of>
+        <any_of>
+            <iterations>50</iterations>
+            <total_time_ms>60000</total_time_ms>
+        </any_of>
+    </stop_conditions>
+
+    <query>SELECT DISTINCT Title, ngramDistance(Title, 'what is love') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistance(Title, 'baby dont hurt me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistance(Title, 'no more') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+
+    <query>SELECT DISTINCT Title, ngramDistanceUTF8CaseInsensitive(Title, 'Метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'как дЕлА') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'Чем зАнимаешЬся') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+
+    <main_metric>
+        <min_time/>
+    </main_metric>
+</test>

From d09f2023c975c2b25156e2259e1ecb183b01e05c Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 02:54:00 +0300
Subject: [PATCH 06/25] typos in perf tests for distance function

---
 .../performance/string_search/ngram_distance.xml     | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml
index 557928cbf12..2c75cd967cb 100644
--- a/dbms/tests/performance/string_search/ngram_distance.xml
+++ b/dbms/tests/performance/string_search/ngram_distance.xml
@@ -28,13 +28,13 @@
     <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'wHAt Is lovE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
     <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'BABY DonT hUrT me') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
     <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
-    <query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
-    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
-    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
 
-    <query>SELECT DISTINCT Title, ngramDistanceUTF8CaseInsensitive(Title, 'Метрика') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
-    <query>SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'как дЕлА') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
-    <query>SELECT DISTINCT URL, ngramDistanceUTF8CaseInsensitive(URL, 'Чем зАнимаешЬся') AS distance FROM {table} ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitiveUTF8(Title, 'Метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'как дЕлА') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'Чем зАнимаешЬся') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
 
     <main_metric>
         <min_time/>

From 97349fb83ea66fcaaf05bfe9ef1fd40227f3dafa Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 02:55:51 +0300
Subject: [PATCH 07/25] typos in perf tests for distance function

---
 dbms/tests/performance/string_search/ngram_distance.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml
index 2c75cd967cb..84b599dd882 100644
--- a/dbms/tests/performance/string_search/ngram_distance.xml
+++ b/dbms/tests/performance/string_search/ngram_distance.xml
@@ -1,5 +1,5 @@
 <test>
-    <name>Distance search performance search</name>
+    <name>Distance search performance test</name>
 
     <tags>
         <tag>search</tag>

From 570af60bfa60d993941b95a5d3335fb8b3277249 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 03:05:14 +0300
Subject: [PATCH 08/25] more typos to the god of typos in distance perf test

---
 dbms/tests/performance/string_search/ngram_distance.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/tests/performance/string_search/ngram_distance.xml b/dbms/tests/performance/string_search/ngram_distance.xml
index 84b599dd882..16960811067 100644
--- a/dbms/tests/performance/string_search/ngram_distance.xml
+++ b/dbms/tests/performance/string_search/ngram_distance.xml
@@ -30,7 +30,7 @@
     <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitive(Title, 'nO MOrE') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
     <query>SELECT DISTINCT Title, ngramDistanceUTF8(Title, 'метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
     <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
-    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'как дела') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
+    <query>SELECT DISTINCT URL, ngramDistanceUTF8(URL, 'чем занимаешься') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
 
     <query>SELECT DISTINCT Title, ngramDistanceCaseInsensitiveUTF8(Title, 'Метрика') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>
     <query>SELECT DISTINCT URL, ngramDistanceCaseInsensitiveUTF8(URL, 'как дЕлА') AS distance FROM hits_100m_single ORDER BY distance ASC LIMIT 50</query>

From dd22d1fb89e491020b7e9b5c13133e8add2f9967 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Wed, 6 Mar 2019 03:08:12 +0300
Subject: [PATCH 09/25] Better docs to the distance functions

---
 docs/en/query_language/functions/string_search_functions.md | 4 ++--
 docs/ru/query_language/functions/string_search_functions.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index c900b52cf94..bde56693c36 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -72,11 +72,11 @@ The same thing as 'like', but negative.
 
 ## ngramDistance(haystack, needle)
 
-Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two sets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
+Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two multisets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
 
 For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
+Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
 
 
 [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) <!--hide-->
diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md
index 48a255ded71..6658cc4ee19 100644
--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -61,10 +61,10 @@
 
 ## ngramDistance(haystack, needle)
 
-Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя множествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице.
+Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализуется на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строки из `haystack` больше 32КБ, расстояние всегда равно единице.
 
 Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв.
+Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв.
 
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) <!--hide-->

From bccbd52d783b7bdceb364e5f8e6051995cabfa70 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Mon, 4 Mar 2019 18:40:49 +0100
Subject: [PATCH 10/25] Rework system.graphite_retentions table

---
 .../GraphiteRollupSortedBlockInputStream.h    |   1 +
 .../MergeTree/registerStorageMergeTree.cpp    |   1 +
 .../Storages/System/StorageSystemGraphite.cpp | 253 ++++++++----------
 .../Storages/System/StorageSystemGraphite.h   |  14 +
 4 files changed, 124 insertions(+), 145 deletions(-)

diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
index bb2f81fc81f..dc5260be0e7 100644
--- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
+++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
@@ -124,6 +124,7 @@ namespace Graphite
 
     struct Params
     {
+        String config_name;
         String path_column_name;
         String time_column_name;
         String value_column_name;
diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
index 103be508564..4b934ea3122 100644
--- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -165,6 +165,7 @@ static void setGraphitePatternsFromConfig(const Context & context,
         throw Exception("No '" + config_element + "' element in configuration file",
             ErrorCodes::NO_ELEMENTS_IN_CONFIG);
 
+    params.config_name = config_element;
     params.path_column_name = config.getString(config_element + ".path_column_name", "Path");
     params.time_column_name = config.getString(config_element + ".time_column_name", "Time");
     params.value_column_name = config.getString(config_element + ".value_column_name", "Value");
diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp
index d75eb71841e..ed37235e270 100644
--- a/dbms/src/Storages/System/StorageSystemGraphite.cpp
+++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp
@@ -1,175 +1,138 @@
 #include <Storages/System/StorageSystemGraphite.h>
+#include <Storages/StorageMergeTree.h>
+#include <Storages/StorageReplicatedMergeTree.h>
 
-#include <Common/StringUtils/StringUtils.h>
-#include <Columns/ColumnString.h>
-#include <Columns/ColumnsNumber.h>
-#include <Core/Field.h>
-#include <DataStreams/OneBlockInputStream.h>
-#include <DataTypes/DataTypeString.h>
-#include <DataTypes/DataTypesNumber.h>
 #include <Interpreters/Context.h>
 
-#include <Poco/Util/Application.h>
-
 
 namespace DB
 {
 
-namespace ErrorCodes
-{
-    extern const int NO_ELEMENTS_IN_CONFIG;
-}
-
-namespace
-{
-
-using namespace Poco::Util;
-
-struct Pattern
-{
-    struct Retention
-    {
-        UInt64 age;
-        UInt64 precision;
-    };
-
-    std::string regexp;
-    std::string function;
-    std::vector<Retention> retentions;
-    UInt16 priority;
-    UInt8 is_default;
-};
-
-static Pattern readOnePattern(
-    const AbstractConfiguration & config,
-    const std::string & path)
-{
-    Pattern pattern;
-    AbstractConfiguration::Keys keys;
-
-    config.keys(path, keys);
-
-    if (keys.empty())
-        throw Exception("Empty pattern in Graphite rollup configuration", ErrorCodes::NO_ELEMENTS_IN_CONFIG);
-
-    for (const auto & key : keys)
-    {
-        const String key_path = path + "." + key;
-
-        if (startsWith(key, "regexp"))
-        {
-            pattern.regexp = config.getString(key_path);
-        }
-        else if (startsWith(key, "function"))
-        {
-            pattern.function = config.getString(key_path);
-        }
-        else if (startsWith(key, "retention"))
-        {
-            pattern.retentions.push_back(Pattern::Retention{0, 0});
-            pattern.retentions.back().age = config.getUInt64(key_path + ".age", 0);
-            pattern.retentions.back().precision = config.getUInt64(key_path + ".precision", 0);
-        }
-    }
-
-    return pattern;
-}
-
-static std::vector<Pattern> readPatterns(
-    const AbstractConfiguration & config,
-    const std::string & section)
-{
-    AbstractConfiguration::Keys keys;
-    std::vector<Pattern> result;
-    size_t count = 0;
-
-    config.keys(section, keys);
-
-    for (const auto & key : keys)
-    {
-        if (startsWith(key, "pattern"))
-        {
-            Pattern pattern(readOnePattern(config, section + "." + key));
-            pattern.is_default = false;
-            pattern.priority = ++count;
-            result.push_back(pattern);
-        }
-        else if (startsWith(key, "default"))
-        {
-            Pattern pattern(readOnePattern(config, section + "." + key));
-            pattern.is_default = true;
-            pattern.priority = std::numeric_limits<UInt16>::max();
-            result.push_back(pattern);
-        }
-    }
-
-    return result;
-}
-
-static Strings getAllGraphiteSections(const AbstractConfiguration & config)
-{
-    Strings result;
-
-    AbstractConfiguration::Keys keys;
-    config.keys(keys);
-
-    for (const auto & key : keys)
-    {
-        if (startsWith(key, "graphite_"))
-            result.push_back(key);
-    }
-
-    return result;
-}
-
-} // namespace
-
 NamesAndTypesList StorageSystemGraphite::getNamesAndTypes()
 {
     return {
-        {"config_name", std::make_shared<DataTypeString>()},
-        {"regexp",      std::make_shared<DataTypeString>()},
-        {"function",    std::make_shared<DataTypeString>()},
-        {"age",         std::make_shared<DataTypeUInt64>()},
-        {"precision",   std::make_shared<DataTypeUInt64>()},
-        {"priority",    std::make_shared<DataTypeUInt16>()},
-        {"is_default",  std::make_shared<DataTypeUInt8>()},
+        {"config_name",     std::make_shared<DataTypeString>()},
+        {"regexp",          std::make_shared<DataTypeString>()},
+        {"function",        std::make_shared<DataTypeString>()},
+        {"age",             std::make_shared<DataTypeUInt64>()},
+        {"precision",       std::make_shared<DataTypeUInt64>()},
+        {"priority",        std::make_shared<DataTypeUInt16>()},
+        {"is_default",      std::make_shared<DataTypeUInt8>()},
+        {"Tables.database", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
+        {"Tables.table",    std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
     };
 }
 
 
+/*
+ * Find all (Replicated)*GraphiteMergeTree tables and collect their configuration parameters
+ */
+StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context & context) const
+{
+    const Databases databases = context.getDatabases();
+    Configs graphite_configs;
+
+    for (const auto & db : databases)
+    {
+        for (auto iterator = db.second->getIterator(context); iterator->isValid(); iterator->next())
+        {
+            auto & table = iterator->table();
+            const MergeTreeData * table_data = nullptr;
+
+            if (const StorageMergeTree * merge_tree = dynamic_cast<StorageMergeTree *>(table.get()))
+            {
+                table_data = & merge_tree->getData();
+            }
+            else if (const StorageReplicatedMergeTree * replicated_merge_tree = dynamic_cast<StorageReplicatedMergeTree *>(table.get()))
+            {
+                table_data = & replicated_merge_tree->getData();
+            }
+            else
+            {
+                continue;
+            }
+
+            if (table_data->merging_params.mode == MergeTreeData::MergingParams::Graphite)
+            {
+                const String config_name = table_data->merging_params.graphite_params.config_name;
+
+                if (graphite_configs.find(config_name) == graphite_configs.end())
+                {
+                  Config new_config = {
+                      & table_data->merging_params.graphite_params,
+                      { table_data->getDatabaseName() },
+                      { table_data->getTableName() },
+                  };
+                  graphite_configs.insert(std::make_pair(config_name, new_config));
+                }
+                else
+                {
+                    graphite_configs[config_name].databases.emplace_back(table_data->getDatabaseName());
+                    graphite_configs[config_name].tables.emplace_back(table_data->getTableName());
+                }
+            }
+        }
+    }
+
+    return graphite_configs;
+}
+
 void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const
 {
-    const auto & config = context.getConfigRef();
+    Configs graphite_configs = StorageSystemGraphite::getConfigs(context);
 
-    Strings sections = getAllGraphiteSections(config);
-    for (const auto & section : sections)
+    for (const auto & config : graphite_configs)
     {
-        const auto patterns = readPatterns(config, section);
-        for (const auto & pattern : patterns)
+        UInt16 priority = 0;
+        for (const auto & pattern : config.second.graphite_params->patterns)
         {
+            bool is_default = pattern.regexp == nullptr;
+            String regexp = "";
+            String function = "";
+
+            if (is_default)
+            {
+                priority = std::numeric_limits<UInt16>::max();
+            }
+            else
+            {
+                priority++;
+                regexp = pattern.regexp->getRE2()->pattern();
+            }
+
+            if (pattern.function)
+            {
+                function = pattern.function->getName();
+            }
+
             if (!pattern.retentions.empty())
             {
-                for (const auto & ret : pattern.retentions)
+                for (const auto & retention : pattern.retentions)
                 {
-                    res_columns[0]->insert(section);
-                    res_columns[1]->insert(pattern.regexp);
-                    res_columns[2]->insert(pattern.function);
-                    res_columns[3]->insert(ret.age);
-                    res_columns[4]->insert(ret.precision);
-                    res_columns[5]->insert(pattern.priority);
-                    res_columns[6]->insert(pattern.is_default);
+                    size_t i = 0;
+                    res_columns[i++]->insert(config.first);
+                    res_columns[i++]->insert(regexp);
+                    res_columns[i++]->insert(function);
+                    res_columns[i++]->insert(retention.age);
+                    res_columns[i++]->insert(retention.precision);
+                    res_columns[i++]->insert(priority);
+                    res_columns[i++]->insert(is_default);
+                    res_columns[i++]->insert(config.second.databases);
+                    res_columns[i++]->insert(config.second.tables);
                 }
             }
             else
             {
-                res_columns[0]->insert(section);
-                res_columns[1]->insert(pattern.regexp);
-                res_columns[2]->insert(pattern.function);
-                res_columns[3]->insert(0);
-                res_columns[4]->insert(0);
-                res_columns[5]->insert(pattern.priority);
-                res_columns[6]->insert(pattern.is_default);
+                size_t i = 0;
+                res_columns[i++]->insert(config.first);
+                res_columns[i++]->insert(regexp);
+                res_columns[i++]->insert(function);
+                res_columns[i++]->insert(NULL);
+                res_columns[i++]->insert(NULL);
+                res_columns[i++]->insert(priority);
+                res_columns[i++]->insert(is_default);
+                res_columns[i++]->insert(config.second.databases);
+                res_columns[i++]->insert(config.second.tables);
             }
         }
     }
diff --git a/dbms/src/Storages/System/StorageSystemGraphite.h b/dbms/src/Storages/System/StorageSystemGraphite.h
index fa63c839857..4205f77f1ea 100644
--- a/dbms/src/Storages/System/StorageSystemGraphite.h
+++ b/dbms/src/Storages/System/StorageSystemGraphite.h
@@ -1,7 +1,10 @@
 #pragma once
 
+#include <DataTypes/DataTypeArray.h>
 #include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypesNumber.h>
 #include <Storages/System/IStorageSystemOneBlock.h>
+#include <Storages/MergeTree/MergeTreeData.h>
 #include <ext/shared_ptr_helper.h>
 
 namespace DB
@@ -15,10 +18,21 @@ public:
 
     static NamesAndTypesList getNamesAndTypes();
 
+    struct Config
+    {
+        const Graphite::Params * graphite_params;
+        Array databases;
+        Array tables;
+    };
+
+    using Configs = std::map<const String, Config>;
+
+
 protected:
     using IStorageSystemOneBlock::IStorageSystemOneBlock;
 
     void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override;
+    StorageSystemGraphite::Configs getConfigs(const Context & context) const;
 };
 
 }

From d1cb4932d7bf7d77a2774ab88cf869c20783346f Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Mon, 4 Mar 2019 19:22:20 +0100
Subject: [PATCH 11/25] Add documentation about system.graphite_retentions

---
 docs/en/operations/system_tables.md | 16 ++++++++++++++++
 docs/ru/operations/system_tables.md | 17 +++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md
index 34b44419cce..c6d90c89cb1 100644
--- a/docs/en/operations/system_tables.md
+++ b/docs/en/operations/system_tables.md
@@ -85,6 +85,22 @@ Columns:
 - `name`(`String`) – The name of the function.
 - `is_aggregate`(`UInt8`) — Whether the function is aggregate.
 
+## system.graphite_retentions
+
+Contains information about the [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameters that are used in tables with [\*GraphiteMergeTree](table_engines/graphitemergetree.md) engines.
+
+Columns:
+- `config_name`     (String) - `graphite_rollup` parameter name.
+- `regexp`          (String) - A pattern for the metric name.
+- `function`        (String) - The name of the aggregating function.
+- `age`             (UInt64) - The minimum age of the data in seconds.
+- `precision`       (UInt64) - How precisely to define the age of the data in seconds.
+- `priority`        (UInt16) - Pattern priority.
+- `is_default`      (UInt8) - Whether the pattern is the default one.
+- `Tables.database` (Array(String)) - Array of database names of the tables that use the `config_name` parameter.
+- `Tables.table`    (Array(String)) - Array of names of the tables that use the `config_name` parameter.
+
+
 ## system.merges
 
 Contains information about merges and part mutations currently in process for tables in the MergeTree family.
diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md
index 82aec59ec29..7a4e69ca1cd 100644
--- a/docs/ru/operations/system_tables.md
+++ b/docs/ru/operations/system_tables.md
@@ -83,6 +83,23 @@ default_expression String - выражение для значения по ум
 
 - `name` (`String`) – Имя функции.
 - `is_aggregate` (`UInt8`) – Признак, является ли функция агрегатной.
+
+## system.graphite_retentions
+
+Содержит информацию о том, какие параметры [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) используются в таблицах с движками [\*GraphiteMergeTree](table_engines/graphitemergetree.md).
+
+Столбцы:
+- `config_name`     (String) - Имя параметра, используемого для `graphite_rollup`.
+- `regexp`          (String) - Шаблон имени метрики.
+- `function`        (String) - Имя агрегирующей функции.
+- `age`             (UInt64) - Минимальный возраст данных в секундах.
+- `precision`       (UInt64) - Точность определения возраста данных в секундах.
+- `priority`        (UInt16) - Приоритет раздела pattern.
+- `is_default`      (UInt8) - Является ли раздел pattern дефолтным.
+- `Tables.database` (Array(String)) - Массив имён баз данных таблиц, использующих параметр `config_name`.
+- `Tables.table`    (Array(String)) - Массив имён таблиц, использующих параметр `config_name`.
+
+
 ## system.merges
 
 Содержит информацию о производящихся прямо сейчас слияниях и мутациях кусков для таблиц семейства MergeTree.

From 90466728c6a1fbfe2a21ecc5f35aa226f9f49ddd Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Mon, 4 Mar 2019 19:50:43 +0100
Subject: [PATCH 12/25] Add tests for system.graphite_retentions

---
 .../test_graphite_merge_tree/test.py          | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/dbms/tests/integration/test_graphite_merge_tree/test.py b/dbms/tests/integration/test_graphite_merge_tree/test.py
index 8e98c97e077..509fbac97d0 100644
--- a/dbms/tests/integration/test_graphite_merge_tree/test.py
+++ b/dbms/tests/integration/test_graphite_merge_tree/test.py
@@ -231,6 +231,50 @@ SELECT * FROM test.graphite;
     assert TSV(result) == TSV(expected)
 
 
+def test_system_graphite_retentions(graphite_table):
+    expected = '''
+graphite_rollup	\\\\.count$	sum	0	0	1	0	['test']	['graphite']
+graphite_rollup	\\\\.max$	max	0	0	2	0	['test']	['graphite']
+graphite_rollup	^five_min\\\\.		31536000	14400	3	0	['test']	['graphite']
+graphite_rollup	^five_min\\\\.		5184000	3600	3	0	['test']	['graphite']
+graphite_rollup	^five_min\\\\.		0	300	3	0	['test']	['graphite']
+graphite_rollup	^one_min	avg	31536000	600	4	0	['test']	['graphite']
+graphite_rollup	^one_min	avg	7776000	300	4	0	['test']	['graphite']
+graphite_rollup	^one_min	avg	0	60	4	0	['test']	['graphite']
+    '''
+    result = q('SELECT * from system.graphite_retentions')
+
+    assert TSV(result) == TSV(expected)
+
+    q('''
+DROP TABLE IF EXISTS test.graphite2;
+CREATE TABLE test.graphite2
+    (metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
+    ENGINE = GraphiteMergeTree('graphite_rollup')
+    PARTITION BY toYYYYMM(date)
+    ORDER BY (metric, timestamp)
+    SETTINGS index_granularity=8192;
+    ''')
+    expected = '''
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+graphite_rollup	['test','test']	['graphite','graphite2']
+    '''
+    result = q('''
+    SELECT
+        config_name,
+        Tables.database,
+        Tables.table
+    FROM system.graphite_retentions
+    ''')
+    assert TSV(result) == TSV(expected)
+
+
 def test_path_dangling_pointer(graphite_table):
     q('''
 DROP TABLE IF EXISTS test.graphite2;

From 8b0d8644c860c8dbd2117114d53fa6da4471d6bf Mon Sep 17 00:00:00 2001
From: alexey-milovidov <milovidov@yandex-team.ru>
Date: Thu, 7 Mar 2019 19:55:53 +0300
Subject: [PATCH 13/25] Update StorageSystemGraphite.cpp

---
 .../Storages/System/StorageSystemGraphite.cpp | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp
index ed37235e270..4f9fb755a23 100644
--- a/dbms/src/Storages/System/StorageSystemGraphite.cpp
+++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp
@@ -41,11 +41,11 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context &
 
             if (const StorageMergeTree * merge_tree = dynamic_cast<StorageMergeTree *>(table.get()))
             {
-                table_data = & merge_tree->getData();
+                table_data = &merge_tree->getData();
             }
             else if (const StorageReplicatedMergeTree * replicated_merge_tree = dynamic_cast<StorageReplicatedMergeTree *>(table.get()))
             {
-                table_data = & replicated_merge_tree->getData();
+                table_data = &replicated_merge_tree->getData();
             }
             else
             {
@@ -54,16 +54,18 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context &
 
             if (table_data->merging_params.mode == MergeTreeData::MergingParams::Graphite)
             {
-                const String config_name = table_data->merging_params.graphite_params.config_name;
+                const String & config_name = table_data->merging_params.graphite_params.config_name;
 
-                if (graphite_configs.find(config_name) == graphite_configs.end())
+                if (!graphite_configs.count(config_name))
                 {
-                  Config new_config = {
-                      & table_data->merging_params.graphite_params,
-                      { table_data->getDatabaseName() },
-                      { table_data->getTableName() },
-                  };
-                  graphite_configs.insert(std::make_pair(config_name, new_config));
+                    Config new_config =
+                    {
+                        /// FIXME Do we own a table? (possible dangling reference)
+                        &table_data->merging_params.graphite_params,
+                        { table_data->getDatabaseName() },
+                        { table_data->getTableName() },
+                    };
+                    graphite_configs.emplace(config_name, new_config);
                 }
                 else
                 {
@@ -87,8 +89,8 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context
         for (const auto & pattern : config.second.graphite_params->patterns)
         {
             bool is_default = pattern.regexp == nullptr;
-            String regexp = "";
-            String function = "";
+            String regexp;
+            String function;
 
             if (is_default)
             {
@@ -97,6 +99,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context
             else
             {
                 priority++;
+                /// FIXME Null pointer dereference for trivial patterns.
                 regexp = pattern.regexp->getRE2()->pattern();
             }
 

From 9e82b44b625b3150380bc11c06b88cebf1926de9 Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev" <mr.felixoid@gmail.com>
Date: Thu, 7 Mar 2019 21:17:06 +0100
Subject: [PATCH 14/25] Review adjustment

---
 .../DataStreams/GraphiteRollupSortedBlockInputStream.h    | 2 ++
 dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp  | 1 +
 dbms/src/Storages/System/StorageSystemGraphite.cpp        | 8 +++-----
 dbms/src/Storages/System/StorageSystemGraphite.h          | 2 +-
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
index dc5260be0e7..00bd2f4b67e 100644
--- a/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
+++ b/dbms/src/DataStreams/GraphiteRollupSortedBlockInputStream.h
@@ -113,6 +113,7 @@ namespace Graphite
     struct Pattern
     {
         std::shared_ptr<OptimizedRegularExpression> regexp;
+        std::string regexp_str;
         AggregateFunctionPtr function;
         Retentions retentions;    /// Must be ordered by 'age' descending.
         enum { TypeUndef, TypeRetention, TypeAggregation, TypeAll } type = TypeAll; /// The type of defined pattern, filled automatically
@@ -216,6 +217,7 @@ private:
     const Graphite::Pattern undef_pattern =
     { /// temporary empty pattern for selectPatternForPath
         nullptr,
+        "",
         nullptr,
         DB::Graphite::Retentions(),
         undef_pattern.TypeUndef,
diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
index 4b934ea3122..6411ec21bac 100644
--- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -102,6 +102,7 @@ static void appendGraphitePattern(
         if (key == "regexp")
         {
             pattern.regexp = std::make_shared<OptimizedRegularExpression>(config.getString(config_element + ".regexp"));
+            pattern.regexp_str = config.getString(config_element + ".regexp");
         }
         else if (key == "function")
         {
diff --git a/dbms/src/Storages/System/StorageSystemGraphite.cpp b/dbms/src/Storages/System/StorageSystemGraphite.cpp
index 4f9fb755a23..fa1b768ac98 100644
--- a/dbms/src/Storages/System/StorageSystemGraphite.cpp
+++ b/dbms/src/Storages/System/StorageSystemGraphite.cpp
@@ -60,8 +60,7 @@ StorageSystemGraphite::Configs StorageSystemGraphite::getConfigs(const Context &
                 {
                     Config new_config =
                     {
-                        /// FIXME Do we own a table? (possible dangling reference)
-                        &table_data->merging_params.graphite_params,
+                        table_data->merging_params.graphite_params,
                         { table_data->getDatabaseName() },
                         { table_data->getTableName() },
                     };
@@ -86,7 +85,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context
     for (const auto & config : graphite_configs)
     {
         UInt16 priority = 0;
-        for (const auto & pattern : config.second.graphite_params->patterns)
+        for (const auto & pattern : config.second.graphite_params.patterns)
         {
             bool is_default = pattern.regexp == nullptr;
             String regexp;
@@ -99,8 +98,7 @@ void StorageSystemGraphite::fillData(MutableColumns & res_columns, const Context
             else
             {
                 priority++;
-                /// FIXME Null pointer dereference for trivial patterns.
-                regexp = pattern.regexp->getRE2()->pattern();
+                regexp = pattern.regexp_str;
             }
 
             if (pattern.function)
diff --git a/dbms/src/Storages/System/StorageSystemGraphite.h b/dbms/src/Storages/System/StorageSystemGraphite.h
index 4205f77f1ea..b874e294782 100644
--- a/dbms/src/Storages/System/StorageSystemGraphite.h
+++ b/dbms/src/Storages/System/StorageSystemGraphite.h
@@ -20,7 +20,7 @@ public:
 
     struct Config
     {
-        const Graphite::Params * graphite_params;
+        Graphite::Params graphite_params;
         Array databases;
         Array tables;
     };

From 1e71559b2dbf53a4db2ae43c46f38bedf3b05714 Mon Sep 17 00:00:00 2001
From: Simon Podlipsky <simon@podlipsky.net>
Date: Sat, 9 Mar 2019 14:58:08 +0100
Subject: [PATCH 15/25] Upgrade librdkafka to RC7

---
 contrib/librdkafka | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/librdkafka b/contrib/librdkafka
index 363dcad5a23..51ae5f5fd8b 160000
--- a/contrib/librdkafka
+++ b/contrib/librdkafka
@@ -1 +1 @@
-Subproject commit 363dcad5a23dc29381cc626620e68ae418b3af19
+Subproject commit 51ae5f5fd8b742e56f47a8bb0136344868818285

From 4e67678b642dc314aaf44eedf59c8aa817d0d20a Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Sat, 9 Mar 2019 19:57:52 +0300
Subject: [PATCH 16/25] Better docs to the distance functions

---
 docs/en/query_language/functions/string_search_functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index bde56693c36..6ae7c03f73c 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -72,7 +72,7 @@ The same thing as 'like', but negative.
 
 ## ngramDistance(haystack, needle)
 
-Calculate the 4-gram distance between `haystack` and `needle`: count the symmetric difference between two multisets of 4-grams and normalize it by the sum of their cardinalities. Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other. If the `needle` is more than 32Kb, throw an exception. If some of the `haystack` strings are more than 32Kb, the distance is always one.
+Calculates the 4-gram distance between `haystack` and `needle`: counts the symmetric difference between two multisets of 4-grams and normalizes it by the sum of their cardinalities. Returns a float number from 0 to 1 -- the closer to zero, the more similar the strings are to each other. If the `needle` is more than 32KB, throws an exception. If some of the `haystack` strings are more than 32KB, the distance is always one.
 
 For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 

From 56872ef0e3d59b87d4761926c994aff447b41b14 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Sat, 9 Mar 2019 19:59:43 +0300
Subject: [PATCH 17/25] Better docs to the distance functions

---
 docs/ru/query_language/functions/string_search_functions.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md
index 6658cc4ee19..8939e4c926c 100644
--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -61,10 +61,10 @@
 
 ## ngramDistance(haystack, needle)
 
-Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализается на сумму их мощностей. Возвращает число float от 0 до 1 - чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строка из `haystack` больше 32КБ, расстояние всегда равно единице.
+Вычисление 4-граммного расстояния между `haystack` и `needle`: считается симметрическая разность между двумя мультимножествами 4-грамм и нормализуется на сумму их мощностей. Возвращает число float от 0 до 1 -- чем ближе к нулю, тем больше строки похожи друг на друга. Если `needle` больше чем 32КБ, кидается исключение. Если некоторые строки из `haystack` больше 32КБ, расстояние всегда равно единице.
 
 Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки - это работает для латиницы и почти для всех кириллических букв.
+Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв.
 
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) <!--hide-->

From b8538c49c98b7e00728f897eaf4a7347c9517a87 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Sat, 9 Mar 2019 20:01:01 +0300
Subject: [PATCH 18/25] Better docs to the distance functions

---
 docs/en/query_language/functions/string_search_functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index 6ae7c03f73c..dce9917776c 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -76,7 +76,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme
 
 For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2 bytes hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
+Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
 
 
 [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) <!--hide-->

From 86df0960d9595293fef7b784b59aa1c8f17a10b0 Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Sat, 9 Mar 2019 20:07:45 +0300
Subject: [PATCH 19/25] Better docs to the distance functions

---
 docs/en/query_language/functions/string_search_functions.md | 2 +-
 docs/ru/query_language/functions/string_search_functions.md | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index dce9917776c..29e8bcf8a38 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -76,7 +76,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme
 
 For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
+Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 12-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
 
 
 [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) <!--hide-->
diff --git a/docs/ru/query_language/functions/string_search_functions.md b/docs/ru/query_language/functions/string_search_functions.md
index 8939e4c926c..4b335cce34c 100644
--- a/docs/ru/query_language/functions/string_search_functions.md
+++ b/docs/ru/query_language/functions/string_search_functions.md
@@ -9,7 +9,7 @@
 Для поиска без учета регистра используйте функцию `positionCaseInsensitive`.
 
 ## positionUTF8(haystack, needle)
-Так же, как `position`, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено - то возвращает какой-нибудь результат (не кидает исключение).
+Так же, как `position`, но позиция возвращается в кодовых точках Unicode. Работает при допущении, что строка содержит набор байт, представляющий текст в кодировке UTF-8. Если допущение не выполнено -- то возвращает какой-нибудь результат (не кидает исключение).
 
 Для поиска без учета регистра используйте функцию `positionCaseInsensitiveUTF8`.
 
@@ -65,6 +65,6 @@
 
 Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами - могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` - мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв.
+Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем двухбайтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш-таблицами -- могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` -- мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв.
 
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) <!--hide-->

From 2905159c8598ab9cc90cc1b690f146cedfbea9df Mon Sep 17 00:00:00 2001
From: Danila Kutenin <kutdanila@yandex.ru>
Date: Sat, 9 Mar 2019 20:26:32 +0300
Subject: [PATCH 20/25] Better docs to the distance functions

---
 docs/en/query_language/functions/string_search_functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/query_language/functions/string_search_functions.md b/docs/en/query_language/functions/string_search_functions.md
index 29e8bcf8a38..dce9917776c 100644
--- a/docs/en/query_language/functions/string_search_functions.md
+++ b/docs/en/query_language/functions/string_search_functions.md
@@ -76,7 +76,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme
 
 For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
 
-Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 12-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
+Notes: For the UTF-8 case we use 3-gram distance. None of these are perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. In the UTF-8 case-insensitive format we do not use a fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and for almost all Cyrillic letters.
 
 
 [Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) <!--hide-->

From 0061df234a0c8fde2a6a2839f35285c441f50b8d Mon Sep 17 00:00:00 2001
From: proller <proller@users.noreply.github.com>
Date: Sat, 9 Mar 2019 21:52:46 +0300
Subject: [PATCH 21/25] Build fix (split) (#4641)

---
 dbms/programs/server/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/programs/server/CMakeLists.txt b/dbms/programs/server/CMakeLists.txt
index 217447413d5..5cb08018065 100644
--- a/dbms/programs/server/CMakeLists.txt
+++ b/dbms/programs/server/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CLICKHOUSE_SERVER_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/TCPHandler.cpp
    )
 
-set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io daemon clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY})
+set(CLICKHOUSE_SERVER_LINK PRIVATE clickhouse_dictionaries clickhouse_common_io PUBLIC daemon PRIVATE clickhouse_storages_system clickhouse_functions clickhouse_aggregate_functions clickhouse_table_functions ${Poco_Net_LIBRARY})
 if (USE_POCO_NETSSL)
     set(CLICKHOUSE_SERVER_LINK ${CLICKHOUSE_SERVER_LINK} PRIVATE ${Poco_NetSSL_LIBRARY} ${Poco_Crypto_LIBRARY})
 endif ()

From 446caea46efe6d297b4a044adc266a45dbd24abd Mon Sep 17 00:00:00 2001
From: proller <proller@users.noreply.github.com>
Date: Sun, 10 Mar 2019 04:28:13 +0300
Subject: [PATCH 22/25]  Update contrib/cppkafka (#4620)

* Update contrib/cppkafka

* Fix
---
 contrib/cppkafka | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/cppkafka b/contrib/cppkafka
index 860c90e92ee..9b184d881c1 160000
--- a/contrib/cppkafka
+++ b/contrib/cppkafka
@@ -1 +1 @@
-Subproject commit 860c90e92eee6690aa74a2ca7b7c5c6930dffecd
+Subproject commit 9b184d881c15cc50784b28688c7c99d3d764db24

From 128fd20adf5ab5861e7893d9d7e655bf8c773872 Mon Sep 17 00:00:00 2001
From: alexey-milovidov <milovidov@yandex-team.ru>
Date: Sun, 10 Mar 2019 04:30:42 +0300
Subject: [PATCH 23/25] Update registerStorageMergeTree.cpp

---
 dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
index 6411ec21bac..a64f376e3de 100644
--- a/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
+++ b/dbms/src/Storages/MergeTree/registerStorageMergeTree.cpp
@@ -101,8 +101,8 @@ static void appendGraphitePattern(
     {
         if (key == "regexp")
         {
-            pattern.regexp = std::make_shared<OptimizedRegularExpression>(config.getString(config_element + ".regexp"));
             pattern.regexp_str = config.getString(config_element + ".regexp");
+            pattern.regexp = std::make_shared<OptimizedRegularExpression>(pattern.regexp_str);
         }
         else if (key == "function")
         {

From b81f73bb132493aca5c184a195d298a261a3366f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@yandex-team.ru>
Date: Sun, 10 Mar 2019 06:13:19 +0300
Subject: [PATCH 24/25] Added a test [#CLICKHOUSE-1704]

---
 dbms/tests/queries/0_stateless/00915_tuple_orantius.reference | 1 +
 dbms/tests/queries/0_stateless/00915_tuple_orantius.sql       | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 dbms/tests/queries/0_stateless/00915_tuple_orantius.reference
 create mode 100644 dbms/tests/queries/0_stateless/00915_tuple_orantius.sql

diff --git a/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference b/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference
new file mode 100644
index 00000000000..6b303cbce8b
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00915_tuple_orantius.reference
@@ -0,0 +1 @@
+1	(1,2,3)	1
diff --git a/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql b/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql
new file mode 100644
index 00000000000..938260c5123
--- /dev/null
+++ b/dbms/tests/queries/0_stateless/00915_tuple_orantius.sql
@@ -0,0 +1 @@
+select 1 as x, (1,2,3) as y, x in y;

From 6db73152d2e1b7b35a03b7e146549dd84fe2992d Mon Sep 17 00:00:00 2001
From: alexey-milovidov <milovidov@yandex-team.ru>
Date: Sun, 10 Mar 2019 06:16:51 +0300
Subject: [PATCH 25/25] Hardening debug build (experimental) (#4632)

* Hardening debug build: more granular memory mappings and ASLR; add memory protection for mark cache and index

* Addition to prev. revision

* Addition to prev. revision

* Addition to prev. revision
---
 dbms/src/AggregateFunctions/QuantileTDigest.h |  2 +-
 dbms/src/Columns/ColumnAggregateFunction.cpp  |  5 ++
 dbms/src/Columns/ColumnAggregateFunction.h    |  2 +
 dbms/src/Columns/ColumnArray.cpp              |  7 +++
 dbms/src/Columns/ColumnArray.h                |  1 +
 dbms/src/Columns/ColumnDecimal.h              |  1 +
 dbms/src/Columns/ColumnFixedString.h          |  5 ++
 dbms/src/Columns/ColumnLowCardinality.cpp     |  1 -
 dbms/src/Columns/ColumnNullable.cpp           |  6 ++
 dbms/src/Columns/ColumnNullable.h             |  1 +
 dbms/src/Columns/ColumnString.cpp             |  7 +++
 dbms/src/Columns/ColumnString.h               |  2 +
 dbms/src/Columns/ColumnTuple.cpp              |  6 ++
 dbms/src/Columns/ColumnTuple.h                |  1 +
 dbms/src/Columns/ColumnUnique.h               |  1 +
 dbms/src/Columns/ColumnVector.h               |  5 ++
 dbms/src/Columns/ColumnVectorHelper.h         |  3 +-
 dbms/src/Columns/IColumn.h                    |  4 ++
 dbms/src/Common/Allocator.cpp                 | 23 ++++++-
 dbms/src/Common/Allocator.h                   | 19 ++++++
 dbms/src/Common/ErrorCodes.cpp                |  1 +
 dbms/src/Common/PODArray.h                    | 63 +++++++++++++++++++
 dbms/src/Interpreters/AggregationCommon.h     | 20 +++---
 .../Storages/MergeTree/MergeTreeDataPart.cpp  |  5 +-
 .../MergeTree/MergeTreeReaderStream.cpp       |  1 +
 libs/libcommon/include/common/mremap.h        |  9 ++-
 26 files changed, 182 insertions(+), 19 deletions(-)

diff --git a/dbms/src/AggregateFunctions/QuantileTDigest.h b/dbms/src/AggregateFunctions/QuantileTDigest.h
index ca7d4f2fb1a..c4ee76b6eed 100644
--- a/dbms/src/AggregateFunctions/QuantileTDigest.h
+++ b/dbms/src/AggregateFunctions/QuantileTDigest.h
@@ -85,7 +85,7 @@ class QuantileTDigest
     Params params;
 
     /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes.
-    static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray<Centroid>) - sizeof(Count) - sizeof(UInt32);
+    static constexpr size_t bytes_in_arena = 128 - sizeof(PODArray<Centroid>) - sizeof(Count) - sizeof(UInt32);
 
     using Summary = PODArray<Centroid, bytes_in_arena / sizeof(Centroid), AllocatorWithStackMemory<Allocator<false>, bytes_in_arena>>;
 
diff --git a/dbms/src/Columns/ColumnAggregateFunction.cpp b/dbms/src/Columns/ColumnAggregateFunction.cpp
index 69bcdac2ab7..4652e4a08c8 100644
--- a/dbms/src/Columns/ColumnAggregateFunction.cpp
+++ b/dbms/src/Columns/ColumnAggregateFunction.cpp
@@ -255,6 +255,11 @@ size_t ColumnAggregateFunction::allocatedBytes() const
     return res;
 }
 
+void ColumnAggregateFunction::protect()
+{
+    data.protect();
+}
+
 MutableColumnPtr ColumnAggregateFunction::cloneEmpty() const
 {
     return create(func, Arenas(1, std::make_shared<Arena>()));
diff --git a/dbms/src/Columns/ColumnAggregateFunction.h b/dbms/src/Columns/ColumnAggregateFunction.h
index 3fc76b4c047..a028a95d68c 100644
--- a/dbms/src/Columns/ColumnAggregateFunction.h
+++ b/dbms/src/Columns/ColumnAggregateFunction.h
@@ -157,6 +157,8 @@ public:
 
     size_t allocatedBytes() const override;
 
+    void protect() override;
+
     void insertRangeFrom(const IColumn & from, size_t start, size_t length) override;
 
     void popBack(size_t n) override;
diff --git a/dbms/src/Columns/ColumnArray.cpp b/dbms/src/Columns/ColumnArray.cpp
index 4ceda666db7..eeb06b64f49 100644
--- a/dbms/src/Columns/ColumnArray.cpp
+++ b/dbms/src/Columns/ColumnArray.cpp
@@ -311,6 +311,13 @@ size_t ColumnArray::allocatedBytes() const
 }
 
 
+void ColumnArray::protect()
+{
+    getData().protect();
+    getOffsets().protect();
+}
+
+
 bool ColumnArray::hasEqualOffsets(const ColumnArray & other) const
 {
     if (offsets == other.offsets)
diff --git a/dbms/src/Columns/ColumnArray.h b/dbms/src/Columns/ColumnArray.h
index 3e1b586e755..d58dfba025a 100644
--- a/dbms/src/Columns/ColumnArray.h
+++ b/dbms/src/Columns/ColumnArray.h
@@ -78,6 +78,7 @@ public:
     void reserve(size_t n) override;
     size_t byteSize() const override;
     size_t allocatedBytes() const override;
+    void protect() override;
     ColumnPtr replicate(const Offsets & replicate_offsets) const override;
     ColumnPtr convertToFullColumnIfConst() const override;
     void getExtremes(Field & min, Field & max) const override;
diff --git a/dbms/src/Columns/ColumnDecimal.h b/dbms/src/Columns/ColumnDecimal.h
index 50a6d9d67fb..372b0c245c0 100644
--- a/dbms/src/Columns/ColumnDecimal.h
+++ b/dbms/src/Columns/ColumnDecimal.h
@@ -87,6 +87,7 @@ public:
     size_t size() const override { return data.size(); }
     size_t byteSize() const override { return data.size() * sizeof(data[0]); }
     size_t allocatedBytes() const override { return data.allocated_bytes(); }
+    void protect() override { data.protect(); }
     void reserve(size_t n) override { data.reserve(n); }
 
     void insertFrom(const IColumn & src, size_t n) override { data.push_back(static_cast<const Self &>(src).getData()[n]); }
diff --git a/dbms/src/Columns/ColumnFixedString.h b/dbms/src/Columns/ColumnFixedString.h
index 941314b8888..b773d7c8eb4 100644
--- a/dbms/src/Columns/ColumnFixedString.h
+++ b/dbms/src/Columns/ColumnFixedString.h
@@ -57,6 +57,11 @@ public:
         return chars.allocated_bytes() + sizeof(n);
     }
 
+    void protect() override
+    {
+        chars.protect();
+    }
+
     Field operator[](size_t index) const override
     {
         return String(reinterpret_cast<const char *>(&chars[n * index]), n);
diff --git a/dbms/src/Columns/ColumnLowCardinality.cpp b/dbms/src/Columns/ColumnLowCardinality.cpp
index c919116112c..c9a475fd8a6 100644
--- a/dbms/src/Columns/ColumnLowCardinality.cpp
+++ b/dbms/src/Columns/ColumnLowCardinality.cpp
@@ -363,7 +363,6 @@ ColumnPtr ColumnLowCardinality::countKeys() const
 }
 
 
-
 ColumnLowCardinality::Index::Index() : positions(ColumnUInt8::create()), size_of_type(sizeof(UInt8)) {}
 
 ColumnLowCardinality::Index::Index(MutableColumnPtr && positions) : positions(std::move(positions))
diff --git a/dbms/src/Columns/ColumnNullable.cpp b/dbms/src/Columns/ColumnNullable.cpp
index b88cf60581b..d9a8ea4f825 100644
--- a/dbms/src/Columns/ColumnNullable.cpp
+++ b/dbms/src/Columns/ColumnNullable.cpp
@@ -291,6 +291,12 @@ size_t ColumnNullable::allocatedBytes() const
     return getNestedColumn().allocatedBytes() + getNullMapColumn().allocatedBytes();
 }
 
+void ColumnNullable::protect()
+{
+    getNestedColumn().protect();
+    getNullMapColumn().protect();
+}
+
 
 namespace
 {
diff --git a/dbms/src/Columns/ColumnNullable.h b/dbms/src/Columns/ColumnNullable.h
index c8453a29689..8012d03b0e8 100644
--- a/dbms/src/Columns/ColumnNullable.h
+++ b/dbms/src/Columns/ColumnNullable.h
@@ -71,6 +71,7 @@ public:
     void reserve(size_t n) override;
     size_t byteSize() const override;
     size_t allocatedBytes() const override;
+    void protect() override;
     ColumnPtr replicate(const Offsets & replicate_offsets) const override;
     void updateHashWithValue(size_t n, SipHash & hash) const override;
     void getExtremes(Field & min, Field & max) const override;
diff --git a/dbms/src/Columns/ColumnString.cpp b/dbms/src/Columns/ColumnString.cpp
index 1717c02f1df..1443283783a 100644
--- a/dbms/src/Columns/ColumnString.cpp
+++ b/dbms/src/Columns/ColumnString.cpp
@@ -412,4 +412,11 @@ void ColumnString::getPermutationWithCollation(const Collator & collator, bool r
     }
 }
 
+
+void ColumnString::protect()
+{
+    getChars().protect();
+    getOffsets().protect();
+}
+
 }
diff --git a/dbms/src/Columns/ColumnString.h b/dbms/src/Columns/ColumnString.h
index 5ca05079bd5..a30a4ceb5a1 100644
--- a/dbms/src/Columns/ColumnString.h
+++ b/dbms/src/Columns/ColumnString.h
@@ -68,6 +68,8 @@ public:
         return chars.allocated_bytes() + offsets.allocated_bytes();
     }
 
+    void protect() override;
+
     MutableColumnPtr cloneResized(size_t to_size) const override;
 
     Field operator[](size_t n) const override
diff --git a/dbms/src/Columns/ColumnTuple.cpp b/dbms/src/Columns/ColumnTuple.cpp
index c235cd07c31..ec0bcc1f5b5 100644
--- a/dbms/src/Columns/ColumnTuple.cpp
+++ b/dbms/src/Columns/ColumnTuple.cpp
@@ -315,6 +315,12 @@ size_t ColumnTuple::allocatedBytes() const
     return res;
 }
 
+void ColumnTuple::protect()
+{
+    for (auto & column : columns)
+        column->assumeMutableRef().protect();
+}
+
 void ColumnTuple::getExtremes(Field & min, Field & max) const
 {
     const size_t tuple_size = columns.size();
diff --git a/dbms/src/Columns/ColumnTuple.h b/dbms/src/Columns/ColumnTuple.h
index d146c8bff6c..c39a92e3c8c 100644
--- a/dbms/src/Columns/ColumnTuple.h
+++ b/dbms/src/Columns/ColumnTuple.h
@@ -71,6 +71,7 @@ public:
     void reserve(size_t n) override;
     size_t byteSize() const override;
     size_t allocatedBytes() const override;
+    void protect() override;
     void forEachSubcolumn(ColumnCallback callback) override;
 
     size_t tupleSize() const { return columns.size(); }
diff --git a/dbms/src/Columns/ColumnUnique.h b/dbms/src/Columns/ColumnUnique.h
index 85a9c498a94..5eee80dc9d8 100644
--- a/dbms/src/Columns/ColumnUnique.h
+++ b/dbms/src/Columns/ColumnUnique.h
@@ -80,6 +80,7 @@ public:
     bool isNumeric() const override { return column_holder->isNumeric(); }
 
     size_t byteSize() const override { return column_holder->byteSize(); }
+    void protect() override { column_holder->assumeMutableRef().protect(); }
     size_t allocatedBytes() const override
     {
         return column_holder->allocatedBytes()
diff --git a/dbms/src/Columns/ColumnVector.h b/dbms/src/Columns/ColumnVector.h
index 1c5a45ef6ad..9de84f95b4a 100644
--- a/dbms/src/Columns/ColumnVector.h
+++ b/dbms/src/Columns/ColumnVector.h
@@ -163,6 +163,11 @@ public:
         return data.allocated_bytes();
     }
 
+    void protect() override
+    {
+        data.protect();
+    }
+
     void insertValue(const T value)
     {
         data.push_back(value);
diff --git a/dbms/src/Columns/ColumnVectorHelper.h b/dbms/src/Columns/ColumnVectorHelper.h
index 8a25812ffe7..d805f44218c 100644
--- a/dbms/src/Columns/ColumnVectorHelper.h
+++ b/dbms/src/Columns/ColumnVectorHelper.h
@@ -24,9 +24,10 @@ namespace DB
 class ColumnVectorHelper : public IColumn
 {
 public:
+    template <size_t ELEMENT_SIZE>
     const char * getRawDataBegin() const
     {
-        return *reinterpret_cast<const char * const *>(reinterpret_cast<const char *>(this) + sizeof(*this));
+        return reinterpret_cast<const PODArrayBase<ELEMENT_SIZE, 4096, Allocator<false>, 15, 16> *>(reinterpret_cast<const char *>(this) + sizeof(*this))->raw_data();
     }
 
     template <size_t ELEMENT_SIZE>
diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h
index 2560b9639ad..86a1097d368 100644
--- a/dbms/src/Columns/IColumn.h
+++ b/dbms/src/Columns/IColumn.h
@@ -253,6 +253,10 @@ public:
     /// Zero, if could be determined.
     virtual size_t allocatedBytes() const = 0;
 
+    /// Make memory region readonly with mprotect if it is large enough.
+    /// The operation is slow and performed only for debug builds.
+    virtual void protect() {}
+
     /// If the column contains subcolumns (such as Array, Nullable, etc), do callback on them.
     /// Shallow: doesn't do recursive calls; don't do call for itself.
     using ColumnCallback = std::function<void(Ptr&)>;
diff --git a/dbms/src/Common/Allocator.cpp b/dbms/src/Common/Allocator.cpp
index ba0c7820187..92ff10eafb7 100644
--- a/dbms/src/Common/Allocator.cpp
+++ b/dbms/src/Common/Allocator.cpp
@@ -43,11 +43,30 @@ namespace ErrorCodes
   *
   * PS. This is also required, because tcmalloc can not allocate a chunk of memory greater than 16 GB.
   */
-static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
+#ifdef NDEBUG
+    static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
+#else
+    /// In debug build, use small mmap threshold to reproduce more memory stomping bugs.
+    /// Along with ASLR it will hopefully detect more issues than ASan.
+    /// The program may fail due to the limit on number of memory mappings.
+    static constexpr size_t MMAP_THRESHOLD = 4096;
+#endif
+
 static constexpr size_t MMAP_MIN_ALIGNMENT = 4096;
 static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;
 
 
+template <bool clear_memory_>
+void * Allocator<clear_memory_>::mmap_hint()
+{
+#if ALLOCATOR_ASLR
+    return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(rng));
+#else
+    return nullptr;
+#endif
+}
+
+
 template <bool clear_memory_>
 void * Allocator<clear_memory_>::alloc(size_t size, size_t alignment)
 {
@@ -61,7 +80,7 @@ void * Allocator<clear_memory_>::alloc(size_t size, size_t alignment)
             throw DB::Exception("Too large alignment " + formatReadableSizeWithBinarySuffix(alignment) + ": more than page size when allocating "
                 + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::BAD_ARGUMENTS);
 
-        buf = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+        buf = mmap(mmap_hint(), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
         if (MAP_FAILED == buf)
             DB::throwFromErrno("Allocator: Cannot mmap " + formatReadableSizeWithBinarySuffix(size) + ".", DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY);
 
diff --git a/dbms/src/Common/Allocator.h b/dbms/src/Common/Allocator.h
index 9a2ab0b975c..d2a81f77b62 100644
--- a/dbms/src/Common/Allocator.h
+++ b/dbms/src/Common/Allocator.h
@@ -2,6 +2,19 @@
 
 #include <string.h>
 
+#ifdef NDEBUG
+    /// If set to 1 - randomize memory mappings manually (address space layout randomization) to reproduce more memory stomping bugs.
+    /// Note that Linux doesn't do it by default. This may lead to worse TLB performance. It is 0 when NDEBUG is defined, 1 otherwise.
+    #define ALLOCATOR_ASLR 0
+#else
+    #define ALLOCATOR_ASLR 1
+#endif
+
+#if ALLOCATOR_ASLR
+    #include <pcg_random.hpp>
+    #include <Common/randomSeed.h>
+#endif
+
 
 /** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
   * Also used in hash tables.
@@ -14,6 +27,12 @@
 template <bool clear_memory_>
 class Allocator
 {
+#if ALLOCATOR_ASLR
+private:
+    pcg64 rng{randomSeed()};
+#endif
+    void * mmap_hint();
+
 protected:
     static constexpr bool clear_memory = clear_memory_;
 
diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp
index d3401427037..f974b2bdaf6 100644
--- a/dbms/src/Common/ErrorCodes.cpp
+++ b/dbms/src/Common/ErrorCodes.cpp
@@ -419,6 +419,7 @@ namespace ErrorCodes
     extern const int BAD_DATABASE_FOR_TEMPORARY_TABLE = 442;
     extern const int NO_COMMON_COLUMNS_WITH_PROTOBUF_SCHEMA = 443;
     extern const int UNKNOWN_PROTOBUF_FORMAT = 444;
+    extern const int CANNOT_MPROTECT = 445;
 
     extern const int KEEPER_EXCEPTION = 999;
     extern const int POCO_EXCEPTION = 1000;
diff --git a/dbms/src/Common/PODArray.h b/dbms/src/Common/PODArray.h
index 462842f8236..a7b8b02bb98 100644
--- a/dbms/src/Common/PODArray.h
+++ b/dbms/src/Common/PODArray.h
@@ -17,10 +17,19 @@
 #include <Common/BitHelpers.h>
 #include <Common/memcpySmall.h>
 
+#ifndef NDEBUG
+    #include <sys/mman.h>
+#endif
+
 
 namespace DB
 {
 
+namespace ErrorCodes
+{
+    extern const int CANNOT_MPROTECT;
+}
+
 inline constexpr size_t integerRoundUp(size_t value, size_t dividend)
 {
     return ((value + dividend - 1) / dividend) * dividend;
@@ -108,6 +117,8 @@ protected:
         if (c_start == null)
             return;
 
+        unprotect();
+
         TAllocator::free(c_start - pad_left, allocated_bytes());
     }
 
@@ -120,6 +131,8 @@ protected:
             return;
         }
 
+        unprotect();
+
         ptrdiff_t end_diff = c_end - c_start;
 
         c_start = reinterpret_cast<char *>(
@@ -155,6 +168,28 @@ protected:
             realloc(allocated_bytes() * 2, std::forward<TAllocatorParams>(allocator_params)...);
     }
 
+#ifndef NDEBUG
+    /// Apply protection `prot` with mprotect to the whole pages lying inside [c_start - pad_left, c_end_of_storage + pad_right), if any.
+    /// The operation is slow and performed only for debug builds. Throws CANNOT_MPROTECT if mprotect fails.
+    void protectImpl(int prot)
+    {
+        static constexpr size_t PROTECT_PAGE_SIZE = 4096;   /// Not named PAGE_SIZE: that is a macro on some platforms.
+
+        char * left_rounded_up = reinterpret_cast<char *>((reinterpret_cast<intptr_t>(c_start) - pad_left + PROTECT_PAGE_SIZE - 1) / PROTECT_PAGE_SIZE * PROTECT_PAGE_SIZE);
+        char * right_rounded_down = reinterpret_cast<char *>((reinterpret_cast<intptr_t>(c_end_of_storage) + pad_right) / PROTECT_PAGE_SIZE * PROTECT_PAGE_SIZE);
+
+        if (right_rounded_down > left_rounded_up)
+        {
+            size_t length = right_rounded_down - left_rounded_up;
+            if (0 != mprotect(left_rounded_up, length, prot))
+                throwFromErrno("Cannot mprotect memory region", ErrorCodes::CANNOT_MPROTECT);
+        }
+    }
+
+    /// Set by protect(). Memory protection is restored in destructor or realloc for further reuse by allocator.
+    bool mprotected = false;
+#endif
+
 public:
     bool empty() const { return c_end == c_start; }
     size_t size() const { return (c_end - c_start) / ELEMENT_SIZE; }
@@ -199,6 +234,23 @@ public:
         c_end += byte_size(1);
     }
 
+    void protect()  /// Make the memory read-only via protectImpl(PROT_READ). No-op when NDEBUG is defined.
+    {
+#ifndef NDEBUG
+        protectImpl(PROT_READ);
+        mprotected = true;  /// Remembered so that unprotect() knows it has to restore access.
+#endif
+    }
+
+    void unprotect()  /// Undo protect(): make the memory readable and writable again. No-op when NDEBUG is defined.
+    {
+#ifndef NDEBUG
+        if (mprotected)
+            protectImpl(PROT_READ | PROT_WRITE);  /// PROT_WRITE alone implies PROT_READ only on some architectures.
+        mprotected = false;
+#endif
+    }
+
     ~PODArrayBase()
     {
         dealloc();
@@ -402,6 +454,11 @@ public:
 
     void swap(PODArray & rhs)
     {
+#ifndef NDEBUG
+        this->unprotect();
+        rhs.unprotect();
+#endif
+
         /// Swap two PODArray objects, arr1 and arr2, that satisfy the following conditions:
         /// - The elements of arr1 are stored on stack.
         /// - The elements of arr2 are stored on heap.
@@ -450,7 +507,9 @@ public:
         };
 
         if (!this->isInitialized() && !rhs.isInitialized())
+        {
             return;
+        }
         else if (!this->isInitialized() && rhs.isInitialized())
         {
             do_move(rhs, *this);
@@ -494,9 +553,13 @@ public:
             rhs.c_end = rhs.c_start + this->byte_size(lhs_size);
         }
         else if (this->isAllocatedFromStack() && !rhs.isAllocatedFromStack())
+        {
             swap_stack_heap(*this, rhs);
+        }
         else if (!this->isAllocatedFromStack() && rhs.isAllocatedFromStack())
+        {
             swap_stack_heap(rhs, *this);
+        }
         else
         {
             std::swap(this->c_start, rhs.c_start);
diff --git a/dbms/src/Interpreters/AggregationCommon.h b/dbms/src/Interpreters/AggregationCommon.h
index 12c2d53819b..74836d4463d 100644
--- a/dbms/src/Interpreters/AggregationCommon.h
+++ b/dbms/src/Interpreters/AggregationCommon.h
@@ -102,23 +102,23 @@ static inline T ALWAYS_INLINE packFixed(
         switch (key_sizes[j])
         {
             case 1:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin() + index, 1);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin<1>() + index, 1);
                 offset += 1;
                 break;
             case 2:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin() + index * 2, 2);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin<2>() + index * 2, 2);
                 offset += 2;
                 break;
             case 4:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin() + index * 4, 4);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin<4>() + index * 4, 4);
                 offset += 4;
                 break;
             case 8:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin() + index * 8, 8);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin<8>() + index * 8, 8);
                 offset += 8;
                 break;
             default:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin() + index * key_sizes[j], key_sizes[j]);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(column)->getRawDataBegin<1>() + index * key_sizes[j], key_sizes[j]);
                 offset += key_sizes[j];
         }
     }
@@ -168,23 +168,23 @@ static inline T ALWAYS_INLINE packFixed(
         switch (key_sizes[j])
         {
             case 1:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin() + i, 1);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin<1>() + i, 1);
                 offset += 1;
                 break;
             case 2:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin() + i * 2, 2);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin<2>() + i * 2, 2);
                 offset += 2;
                 break;
             case 4:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin() + i * 4, 4);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin<4>() + i * 4, 4);
                 offset += 4;
                 break;
             case 8:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin() + i * 8, 8);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin<8>() + i * 8, 8);
                 offset += 8;
                 break;
             default:
-                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin() + i * key_sizes[j], key_sizes[j]);
+                memcpy(bytes + offset, static_cast<const ColumnVectorHelper *>(key_columns[j])->getRawDataBegin<1>() + i * key_sizes[j], key_sizes[j]);
                 offset += key_sizes[j];
         }
     }
diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp
index bf9c5b3409d..01ff4c4cdac 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp
@@ -513,13 +513,16 @@ void MergeTreeDataPart::loadIndex()
 
         for (size_t i = 0; i < marks_count; ++i)    //-V756
             for (size_t j = 0; j < key_size; ++j)
-                storage.primary_key_data_types[j]->deserializeBinary(*loaded_index[j].get(), index_file);
+                storage.primary_key_data_types[j]->deserializeBinary(*loaded_index[j], index_file);
 
         for (size_t i = 0; i < key_size; ++i)
+        {
+            loaded_index[i]->protect();
             if (loaded_index[i]->size() != marks_count)
                 throw Exception("Cannot read all data from index file " + index_path
                     + "(expected size: " + toString(marks_count) + ", read: " + toString(loaded_index[i]->size()) + ")",
                     ErrorCodes::CANNOT_READ_ALL_DATA);
+        }
 
         if (!index_file.eof())
             throw Exception("Index file " + index_path + " is unexpectedly long", ErrorCodes::EXPECTED_END_OF_FILE);
diff --git a/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp b/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp
index 9091228d80a..89f5aaeafd5 100644
--- a/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeReaderStream.cpp
@@ -132,6 +132,7 @@ void MergeTreeReaderStream::loadMarks()
         if (buffer.eof() || buffer.buffer().size() != file_size)
             throw Exception("Cannot read all marks from file " + mrk_path, ErrorCodes::CANNOT_READ_ALL_DATA);
 
+        res->protect();
         return res;
     };
 
diff --git a/libs/libcommon/include/common/mremap.h b/libs/libcommon/include/common/mremap.h
index f569ff05d4e..31ca74da827 100644
--- a/libs/libcommon/include/common/mremap.h
+++ b/libs/libcommon/include/common/mremap.h
@@ -12,7 +12,8 @@
 
 #define MREMAP_MAYMOVE 1
 
-void * mremap(void * old_address,
+void * mremap(
+    void * old_address,
     size_t old_size,
     size_t new_size,
     int flags = 0,
@@ -23,7 +24,8 @@ void * mremap(void * old_address,
 
 #endif
 
-inline void * clickhouse_mremap(void * old_address,
+inline void * clickhouse_mremap(
+    void * old_address,
     size_t old_size,
     size_t new_size,
     int flags = 0,
@@ -32,7 +34,8 @@ inline void * clickhouse_mremap(void * old_address,
     [[maybe_unused]] int mmap_fd = -1,
     [[maybe_unused]] off_t mmap_offset = 0)
 {
-    return mremap(old_address,
+    return mremap(
+        old_address,
         old_size,
         new_size,
         flags