Merge branch 'master' of github.com:yandex/ClickHouse

2024-10-15 13:00:50 +00:00 · 2019-01-18 19:08:47 +03:00 · 2019-01-18 19:08:47 +03:00 · 16bd6234ff
commit 16bd6234ff
parent 4d3ad6d3a2 84f1fe6e75
94 changed files with 26681 additions and 1095 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -179,8 +179,8 @@ endif (TEST_COVERAGE)

 if (ENABLE_TESTS)
    message (STATUS "Tests are enabled")
-    enable_testing()
 endif ()
+enable_testing() # Enable for tests without binary

 # when installing to /usr - place configs to /etc but for /usr/local place to /usr/local/etc
 if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
--- a/PreLoad.cmake
+++ b/PreLoad.cmake
@ -1,7 +1,7 @@
 # Use Ninja instead of Unix Makefiles by default.
 # https://stackoverflow.com/questions/11269833/cmake-selecting-a-generator-within-cmakelists-txt
 #
-# Reason: it have better startup time than make and it parallelize jobs more uniformly.
+# Reason: it has better startup time than make and it parallelizes jobs more uniformly.
 # (when comparing to make with Makefiles that was generated by CMake)
 #
 # How to install Ninja on Ubuntu:
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -54,6 +54,7 @@ if (USE_INTERNAL_UNWIND_LIBRARY)
 endif ()

 if (USE_INTERNAL_ZLIB_LIBRARY)
+    set (ZLIB_ENABLE_TESTS 0 CACHE INTERNAL "")
    set (ZLIB_COMPAT 1 CACHE INTERNAL "") # also enables WITH_GZFILEOP
    set (WITH_NATIVE_INSTRUCTIONS ${ARCH_NATIVE} CACHE INTERNAL "")
    if (OS_FREEBSD OR ARCH_I386)
@ -74,15 +75,15 @@ if (USE_INTERNAL_ZLIB_LIBRARY)
       target_compile_definitions (zlibstatic PUBLIC X86_64 UNALIGNED_OK)
    endif ()

-    set_target_properties(example PROPERTIES EXCLUDE_FROM_ALL 1)
-    if (TARGET example64)
-        set_target_properties(example64 PROPERTIES EXCLUDE_FROM_ALL 1)
-    endif ()
+    #set_target_properties(example PROPERTIES EXCLUDE_FROM_ALL 1)
+    #if (TARGET example64)
+    #    set_target_properties(example64 PROPERTIES EXCLUDE_FROM_ALL 1)
+    #endif ()

-    set_target_properties(minigzip PROPERTIES EXCLUDE_FROM_ALL 1)
-    if (TARGET minigzip64)
-        set_target_properties(minigzip64 PROPERTIES EXCLUDE_FROM_ALL 1)
-    endif ()
+    #set_target_properties(minigzip PROPERTIES EXCLUDE_FROM_ALL 1)
+    #if (TARGET minigzip64)
+    #    set_target_properties(minigzip64 PROPERTIES EXCLUDE_FROM_ALL 1)
+    #endif ()
 endif ()

 if (USE_INTERNAL_CCTZ_LIBRARY)
--- a/dbms/cmake/version.cmake
+++ b/dbms/cmake/version.cmake
@ -2,10 +2,10 @@
 set(VERSION_REVISION 54413)
 set(VERSION_MAJOR 19)
 set(VERSION_MINOR 1)
-set(VERSION_PATCH 0)
-set(VERSION_GITHASH 014e344a36bc19a58621e0add379984cf62b9067)
-set(VERSION_DESCRIBE v19.1.0-testing)
-set(VERSION_STRING 19.1.0)
+set(VERSION_PATCH 1)
+set(VERSION_GITHASH 4e7747117123f5a1b027a64865844b4faa10447d)
+set(VERSION_DESCRIBE v19.1.1-testing)
+set(VERSION_STRING 19.1.1)
 # end of autochange

 set(VERSION_EXTRA "" CACHE STRING "")
--- a/dbms/programs/clang/CMakeLists.txt
+++ b/dbms/programs/clang/CMakeLists.txt
@ -27,7 +27,7 @@ elseif (EXISTS ${INTERNAL_COMPILER_BIN_ROOT}${INTERNAL_COMPILER_EXECUTABLE})
 endif ()

 if (COPY_HEADERS_COMPILER AND OS_LINUX)
-    add_custom_target (copy-headers env CLANG=${COPY_HEADERS_COMPILER} BUILD_PATH=${ClickHouse_BINARY_DIR} DESTDIR=${ClickHouse_SOURCE_DIR} ${ClickHouse_SOURCE_DIR}/copy_headers.sh ${ClickHouse_SOURCE_DIR} ${TMP_HEADERS_DIR} DEPENDS ${COPY_HEADERS_DEPENDS} WORKING_DIRECTORY ${ClickHouse_SOURCE_DIR} SOURCES ${ClickHouse_SOURCE_DIR}/copy_headers.sh)
+    add_custom_target (copy-headers [ -f ${TMP_HEADERS_DIR}/dbms/src/Interpreters/SpecializedAggregator.h ] || env CLANG=${COPY_HEADERS_COMPILER} BUILD_PATH=${ClickHouse_BINARY_DIR} DESTDIR=${ClickHouse_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/copy_headers.sh ${ClickHouse_SOURCE_DIR} ${TMP_HEADERS_DIR} DEPENDS ${COPY_HEADERS_DEPENDS} WORKING_DIRECTORY ${ClickHouse_SOURCE_DIR} SOURCES copy_headers.sh)

    if (USE_INTERNAL_LLVM_LIBRARY)
        set (CLANG_HEADERS_DIR "${ClickHouse_SOURCE_DIR}/contrib/llvm/clang/lib/Headers")
--- a/dbms/programs/clang/copy_headers.sh
+++ b/dbms/programs/clang/copy_headers.sh
@ -19,7 +19,7 @@ set -e
 #
 # sudo ./copy_headers.sh . /usr/share/clickhouse/headers/

-SOURCE_PATH=${1:-.}
+SOURCE_PATH=${1:-../../..}
 DST=${2:-$SOURCE_PATH/../headers}
 BUILD_PATH=${BUILD_PATH=${3:-$SOURCE_PATH/build}}

--- a/dbms/programs/server/CMakeLists.txt
+++ b/dbms/programs/server/CMakeLists.txt
@ -25,7 +25,10 @@ endif ()

 if (OS_LINUX AND MAKE_STATIC_LIBRARIES)
    set (GLIBC_MAX_REQUIRED 2.4 CACHE INTERNAL "")
-    add_test(NAME GLIBC_required_version COMMAND bash -c "readelf -s ${CMAKE_CURRENT_BINARY_DIR}/../clickhouse-server | grep '@GLIBC' | grep -oP 'GLIBC_[\\d\\.]+' | sort | uniq | sort -r | perl -lnE 'exit 1 if $_ gt q{GLIBC_${GLIBC_MAX_REQUIRED}}'")
+    # Temporarily disabled. To enable, change 'exit 0' to 'exit $a'.
+    add_test(NAME GLIBC_required_version COMMAND bash -c "readelf -s ${CMAKE_CURRENT_BINARY_DIR}/../clickhouse-server | perl -nE 'END {exit 0 if $a} ++$a, print if /\\x40GLIBC_(\\S+)/ and pack(q{C*}, split /\\./, \$1) gt pack q{C*}, split /\\./, q{${GLIBC_MAX_REQUIRED}}'")
+
+    #add_test(NAME GLIBC_required_version COMMAND bash -c "readelf -s ${CMAKE_CURRENT_BINARY_DIR}/../clickhouse-server | grep '@GLIBC' | grep -oP 'GLIBC_[\\d\\.]+' | sort | uniq | sort --version-sort --reverse | perl -lnE 'warn($_), exit 1 if $_ gt q{GLIBC_${GLIBC_MAX_REQUIRED}}'") # old
 endif ()

 install (
--- a/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionSequenceMatch.h
@ -235,6 +235,11 @@ private:
        actions.clear();
        actions.emplace_back(PatternActionType::KleeneStar);

+        dfa_states.clear();
+        dfa_states.emplace_back(true);
+
+        pattern_has_time = false;
+
        const char * pos = pattern.data();
        const char * begin = pos;
        const char * end = pos + pattern.size();
@ -285,6 +290,7 @@ private:
                        actions.back().type != PatternActionType::KleeneStar)
                        throw Exception{"Temporal condition should be preceeded by an event condition", ErrorCodes::BAD_ARGUMENTS};

+                    pattern_has_time = true;
                    actions.emplace_back(type, duration);
                }
                else
@ -299,6 +305,9 @@ private:
                        throw Exception{"Event number " + toString(event_number) + " is out of range", ErrorCodes::BAD_ARGUMENTS};

                    actions.emplace_back(PatternActionType::SpecificEvent, event_number - 1);
+                    dfa_states.back().transition = DFATransition::SpecificEvent;
+                    dfa_states.back().event = event_number - 1;
+                    dfa_states.emplace_back();
                }

                if (!match(")"))
@ -306,17 +315,88 @@ private:

            }
            else if (match(".*"))
+            {
                actions.emplace_back(PatternActionType::KleeneStar);
+                dfa_states.back().has_kleene = true;
+            }
            else if (match("."))
+            {
                actions.emplace_back(PatternActionType::AnyEvent);
+                dfa_states.back().transition = DFATransition::AnyEvent;
+                dfa_states.emplace_back();
+            }
            else
                throw_exception("Could not parse pattern, unexpected starting symbol");
        }
    }

 protected:
+    /// Uses a DFA based approach in order to better handle patterns without
+    /// time assertions.
+    ///
+    /// NOTE: This implementation relies on the assumption that the patterns are *small*.
+    ///
+    /// This algorithm performs in O(mn) (with m the number of DFA states and n the number
+    /// of events) with a memory consumption and memory allocations in O(m). It means that
+    /// if n >>> m (which is expected to be the case), this algorithm can be considered linear.
    template <typename T>
-    bool match(T & events_it, const T events_end) const
+    bool dfaMatch(T & events_it, const T events_end) const
+    {
+        using ActiveStates = std::vector<bool>;
+
+        /// Those two vectors keep track of which states should be considered for the current
+        /// event as well as the states which should be considered for the next event.
+        ActiveStates active_states(dfa_states.size(), false);
+        ActiveStates next_active_states(dfa_states.size(), false);
+        active_states[0] = true;
+
+        /// Keeps track of dead-ends in order not to iterate over all the events to realize that
+        /// the match failed.
+        size_t n_active = 1;
+
+        for (/* empty */; events_it != events_end && n_active > 0 && !active_states.back(); ++events_it)
+        {
+            n_active = 0;
+            next_active_states.assign(dfa_states.size(), false);
+
+            for (size_t state = 0; state < dfa_states.size(); ++state)
+            {
+                if (!active_states[state])
+                {
+                    continue;
+                }
+
+                switch (dfa_states[state].transition)
+                {
+                    case DFATransition::None:
+                        break;
+                    case DFATransition::AnyEvent:
+                        next_active_states[state + 1] = true;
+                        ++n_active;
+                        break;
+                    case DFATransition::SpecificEvent:
+                        if (events_it->second.test(dfa_states[state].event))
+                        {
+                            next_active_states[state + 1] = true;
+                            ++n_active;
+                        }
+                        break;
+                }
+
+                if (dfa_states[state].has_kleene)
+                {
+                    next_active_states[state] = true;
+                    ++n_active;
+                }
+            }
+            swap(active_states, next_active_states);
+        }
+
+        return active_states.back();
+    }
+
+    template <typename T>
+    bool backtrackingMatch(T & events_it, const T events_end) const
    {
        const auto action_begin = std::begin(actions);
        const auto action_end = std::end(actions);
@ -445,10 +525,53 @@ protected:
        return action_it == action_end;
    }

+private:
+    enum class DFATransition : char
+    {
+        ///   .-------.
+        ///   |       |
+        ///   `-------'
+        None,
+        ///   .-------.  (?[0-9])
+        ///   |       | ----------
+        ///   `-------'
+        SpecificEvent,
+        ///   .-------.      .
+        ///   |       | ----------
+        ///   `-------'
+        AnyEvent,
+    };
+
+    struct DFAState
+    {
+        DFAState(bool has_kleene = false)
+            : has_kleene{has_kleene}, event{0}, transition{DFATransition::None}
+        {}
+
+        ///   .-------.
+        ///   |       | - - -
+        ///   `-------'
+        ///     |_^
+        bool has_kleene;
+        /// In the case of a state transition with a `SpecificEvent`,
+        /// `event` contains the value of the event.
+        uint32_t event;
+        /// The kind of transition out of this state.
+        DFATransition transition;
+    };
+
+    using DFAStates = std::vector<DFAState>;
+
+protected:
+    /// `true` if the parsed pattern contains time assertions (?t...), `false` otherwise.
+    bool pattern_has_time;
+
 private:
    std::string pattern;
    size_t arg_count;
    PatternActions actions;
+
+    DFAStates dfa_states;
 };


@ -471,7 +594,8 @@ public:
        const auto events_end = std::end(data_ref.events_list);
        auto events_it = events_begin;

-        static_cast<ColumnUInt8 &>(to).getData().push_back(match(events_it, events_end));
+        bool match = pattern_has_time ? backtrackingMatch(events_it, events_end) : dfaMatch(events_it, events_end);
+        static_cast<ColumnUInt8 &>(to).getData().push_back(match);
    }
 };

@ -501,7 +625,7 @@ private:
        auto events_it = events_begin;

        size_t count = 0;
-        while (events_it != events_end && match(events_it, events_end))
+        while (events_it != events_end && backtrackingMatch(events_it, events_end))
            ++count;

        return count;
--- a/dbms/src/Common/Volnitsky.h
+++ b/dbms/src/Common/Volnitsky.h
@ -1,15 +1,17 @@
 #pragma once

-#include <Common/StringSearcher.h>
-#include <Common/StringUtils/StringUtils.h>
+#include <algorithm>
+#include <vector>
+#include <stdint.h>
+#include <string.h>
+#include <Columns/ColumnString.h>
 #include <Core/Types.h>
 #include <Poco/UTF8Encoding.h>
 #include <Poco/Unicode.h>
+#include <Common/StringSearcher.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <common/StringRef.h>
 #include <common/unaligned.h>
-#include <ext/range.h>
-#include <stdint.h>
-#include <string.h>
-

 /** Search for a substring in a string by Volnitsky's algorithm
  * http://volnitsky.com/project/str_search/
@ -28,117 +30,38 @@
  * - if it did not match, we check the next cell of the hash table from the collision resolution chain;
  * - if not found, skip to haystack almost the size of the needle bytes;
  *
-  * Unaligned memory access is used.
+  * MultiVolnitsky - search for multiple substrings in a string:
+  * - Add bigrams to hash table with string index. Then the usual Volnitsky search is used.
+  * - We are adding while searching, limiting the number of fallback searchers and the total number of added bigrams
  */


 namespace DB
 {
-
-
-/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
-template <typename CRTP>
-class VolnitskyBase
+namespace VolnitskyTraits
 {
-protected:
    using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
+    using Id = UInt8; /// Index of the string (within the array of multiple needles), must not be greater than 255.
    using Ngram = UInt16; /// n-gram (2 bytes).

-    const UInt8 * const needle;
-    const size_t needle_size;
-    const UInt8 * const needle_end = needle + needle_size;
-    /// For how long we move, if the n-gram from haystack is not found in the hash table.
-    const size_t step = needle_size - sizeof(Ngram) + 1;
-
-    /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
-      *  storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
-    static const size_t hash_size = 64 * 1024;    /// Fits into the L2 cache (of common Intel CPUs).
-    Offset hash[hash_size];    /// Hash table.
+    /** Fits into the L2 cache (of common Intel CPUs).
+     * This number is extremely good for compilers as it is numeric_limits<UInt16>::max() + 1 and there are optimizations with movzwl and other instructions with 2 bytes
+     */
+    static constexpr size_t hash_size = 64 * 1024;

    /// min haystack size to use main algorithm instead of fallback
-    static constexpr auto min_haystack_size_for_algorithm = 20000;
-    const bool fallback; /// Do we need to use the fallback algorithm.
+    static constexpr size_t min_haystack_size_for_algorithm = 20000;

-public:
-    /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
-      * If you specify it small enough, the fallback algorithm will be used,
-      *  since it is considered that it's useless to waste time initializing the hash table.
-      */
-    VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
-    : needle{reinterpret_cast<const UInt8 *>(needle)}, needle_size{needle_size},
-      fallback{
-          needle_size < 2 * sizeof(Ngram)
-          || needle_size >= std::numeric_limits<Offset>::max()
-          || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
+    static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0)
    {
-        if (fallback)
-            return;
-
-        memset(hash, 0, sizeof(hash));
-
-        /// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
-        for (auto i = static_cast<int>(needle_size - sizeof(Ngram)); i >= 0; --i)
-            self().putNGram(this->needle + i, i + 1, this->needle);
+        return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max()
+            || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm);
    }

+    static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); }

-    /// If not found, the end of the haystack is returned.
-    const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
-    {
-        if (needle_size == 0)
-            return haystack;
-
-        const auto haystack_end = haystack + haystack_size;
-
-        if (needle_size == 1 || fallback || haystack_size <= needle_size)
-            return self().search_fallback(haystack, haystack_end);
-
-        /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
-        const auto * pos = haystack + needle_size - sizeof(Ngram);
-        for (; pos <= haystack_end - needle_size; pos += step)
-        {
-            /// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
-            for (size_t cell_num = toNGram(pos) % hash_size; hash[cell_num];
-                 cell_num = (cell_num + 1) % hash_size)
-            {
-                /// When found - compare bytewise, using the offset from the hash table.
-                const auto res = pos - (hash[cell_num] - 1);
-
-                if (self().compare(res))
-                    return res;
-            }
-        }
-
-        /// The remaining tail.
-        return self().search_fallback(pos - step + 1, haystack_end);
-    }
-
-    const char * search(const char * haystack, size_t haystack_size) const
-    {
-        return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size));
-    }
-
-protected:
-    CRTP & self() { return static_cast<CRTP &>(*this); }
-    const CRTP & self() const { return const_cast<VolnitskyBase *>(this)->self(); }
-
-    static Ngram toNGram(const UInt8 * const pos)
-    {
-        return unalignedLoad<Ngram>(pos);
-    }
-
-    void putNGramBase(const Ngram ngram, const int offset)
-    {
-        /// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
-        size_t cell_num = ngram % hash_size;
-
-        while (hash[cell_num])
-            cell_num = (cell_num + 1) % hash_size; /// Search for the next free cell.
-
-        hash[cell_num] = offset;
-    }
-
-    void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset)
+    template <typename Callback>
+    static inline void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset, const Callback & putNGramBase)
    {
        struct Chars
        {
@ -186,74 +109,21 @@ protected:
            /// 1 combination: 01
            putNGramBase(n, offset);
    }
-};

-
-template <bool CaseSensitive, bool ASCII> struct VolnitskyImpl;
-
-/// Case sensitive comparison
-template <bool ASCII> struct VolnitskyImpl<true, ASCII> : VolnitskyBase<VolnitskyImpl<true, ASCII>>
-{
-    VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
-        : VolnitskyBase<VolnitskyImpl<true, ASCII>>{needle_, needle_size_, haystack_size_hint},
-          fallback_searcher{needle_, needle_size_}
+    template <bool CaseSensitive, bool ASCII, typename Callback>
+    static inline void putNGram(const UInt8 * const pos, const int offset, [[maybe_unused]] const UInt8 * const begin, const Callback & putNGramBase)
    {
-    }
-
-    void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const /*begin*/)
+        if constexpr (CaseSensitive)
        {
-        this->putNGramBase(this->toNGram(pos), offset);
+            putNGramBase(toNGram(pos), offset);
        }
-
-    bool compare(const UInt8 * const pos) const
+        else
        {
-        /// @todo: maybe just use memcmp for this case and rely on internal SSE optimization as in case with memcpy?
-        return fallback_searcher.compare(pos);
-    }
-
-    const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
+            if constexpr (ASCII)
            {
-        return fallback_searcher.search(haystack, haystack_end);
+                putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
            }
-
-    ASCIICaseSensitiveStringSearcher fallback_searcher;
-};
-
-/// Case-insensitive ASCII
-template <> struct VolnitskyImpl<false, true> : VolnitskyBase<VolnitskyImpl<false, true>>
-{
-    VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
-        : VolnitskyBase{needle_, needle_size_, haystack_size_hint}, fallback_searcher{needle_, needle_size_}
-    {
-    }
-
-    void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const /*begin*/)
-    {
-        putNGramASCIICaseInsensitive(pos, offset);
-    }
-
-    bool compare(const UInt8 * const pos) const
-    {
-        return fallback_searcher.compare(pos);
-    }
-
-    const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
-    {
-        return fallback_searcher.search(haystack, haystack_end);
-    }
-
-    ASCIICaseInsensitiveStringSearcher fallback_searcher;
-};
-
-/// Case-sensitive UTF-8
-template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<false, false>>
-{
-    VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
-        : VolnitskyBase{needle_, needle_size_, haystack_size_hint}, fallback_searcher{needle_, needle_size_}
-    {
-    }
-
-    void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const begin)
+            else
            {
                struct Chars
                {
@ -263,16 +133,14 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal

                union
                {
-            Ngram n;
+                    VolnitskyTraits::Ngram n;
                    Chars chars;
                };

                n = toNGram(pos);

                if (isascii(chars.c0) && isascii(chars.c1))
-        {
-            putNGramASCIICaseInsensitive(pos, offset);
-        }
+                    putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
                else
                {
                    /** n-gram (in the case of n = 2)
@ -435,25 +303,389 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
                    }
                }
            }
+        }
+    }
+}

-    bool compare(const UInt8 * const pos) const
+
+/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
+template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
+class VolnitskyBase
+{
+protected:
+    const UInt8 * const needle;
+    const size_t needle_size;
+    const UInt8 * const needle_end = needle + needle_size;
+    /// For how long we move, if the n-gram from haystack is not found in the hash table.
+    const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
+
+    /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
+      *  storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
+    VolnitskyTraits::Offset hash[VolnitskyTraits::hash_size]; /// Hash table.
+
+    const bool fallback; /// Do we need to use the fallback algorithm.
+
+    FallbackSearcher fallback_searcher;
+
+public:
+    /** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
+      * If you specify it small enough, the fallback algorithm will be used,
+      *  since it is considered that it's useless to waste time initializing the hash table.
+      */
+    VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
+        : needle{reinterpret_cast<const UInt8 *>(needle)}
+        , needle_size{needle_size}
+        , fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
+        , fallback_searcher{needle, needle_size}
    {
-        return fallback_searcher.compare(pos);
+        if (fallback)
+            return;
+
+        memset(hash, 0, sizeof(hash));
+
+        auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };
+        /// ssize_t is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
+        for (auto i = static_cast<ssize_t>(needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i)
+            VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + 1, this->needle, callback);
    }

-    const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
+
+    /// If not found, the end of the haystack is returned.
+    const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
    {
+        if (needle_size == 0)
+            return haystack;
+
+        const auto haystack_end = haystack + haystack_size;
+
+        if (fallback || haystack_size <= needle_size)
            return fallback_searcher.search(haystack, haystack_end);
+
+        /// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
+        const auto * pos = haystack + needle_size - sizeof(VolnitskyTraits::Ngram);
+        for (; pos <= haystack_end - needle_size; pos += step)
+        {
+            /// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
+            for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num];
+                 cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
+            {
+                /// When found - compare bytewise, using the offset from the hash table.
+                const auto res = pos - (hash[cell_num] - 1);
+
+                /// pointers in the code always refer to padded arrays, so we can use pagesafe semantics
+                if (fallback_searcher.compare(res))
+                    return res;
+            }
        }

-    UTF8CaseInsensitiveStringSearcher fallback_searcher;
+        return fallback_searcher.search(pos - step + 1, haystack_end);
+    }
+
+    const char * search(const char * haystack, size_t haystack_size) const
+    {
+        return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size));
+    }
+
+protected:
+    void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset)
+    {
+        /// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
+        size_t cell_num = ngram % VolnitskyTraits::hash_size;
+
+        while (hash[cell_num])
+            cell_num = (cell_num + 1) % VolnitskyTraits::hash_size; /// Search for the next free cell.
+
+        hash[cell_num] = offset;
+    }
 };


-using Volnitsky = VolnitskyImpl<true, true>;
-using VolnitskyUTF8 = VolnitskyImpl<true, false>;    /// exactly same as Volnitsky
-using VolnitskyCaseInsensitive = VolnitskyImpl<false, true>;    /// ignores non-ASCII bytes
-using VolnitskyCaseInsensitiveUTF8 = VolnitskyImpl<false, false>;
+template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
+class MultiVolnitskyBase
+{
+private:
+    /// needles and their offsets
+    const std::vector<StringRef> & needles;
+
+
+    /// fallback searchers
+    std::vector<size_t> fallback_needles;
+    std::vector<FallbackSearcher> fallback_searchers;
+
+    /// because std::pair<> is not POD
+    struct OffsetId
+    {
+        VolnitskyTraits::Id id;
+        VolnitskyTraits::Offset off;
+    };
+
+    OffsetId hash[VolnitskyTraits::hash_size];
+
+    /// step for each bunch of strings
+    size_t step;
+
+    /// last index of offsets that was not processed
+    size_t last;
+
+    /// limit for adding to hashtable. In the worst case with case-insensitive search, the table will be at most half full
+    static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8;
+
+public:
+    MultiVolnitskyBase(const std::vector<StringRef> & needles_) : needles{needles_}, step{0}, last{0}
+    {
+        fallback_searchers.reserve(needles.size());
+    }
+
+    template <typename ResultType, typename AnsCallback>
+    void searchAll(
+        const ColumnString::Chars & haystack_data,
+        const ColumnString::Offsets & haystack_offsets,
+        const AnsCallback & ansCallback,
+        ResultType & ans)
+    {
+        const size_t haystack_string_size = haystack_offsets.size();
+        const size_t needles_size = needles.size();
+
+        /// something can be uninitialized after
+        std::fill(ans.begin(), ans.end(), 0);
+
+        while (!reset())
+        {
+            size_t fallback_size = fallback_needles.size();
+            size_t prev_offset = 0;
+            for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
+            {
+                const auto * haystack = &haystack_data[prev_offset];
+                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
+                for (size_t i = 0; i < fallback_size; ++i)
+                {
+                    const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
+                    if (ptr != haystack_end)
+                        ans[from + fallback_needles[i]] = ansCallback(haystack, ptr);
+                }
+
+                /// check if we have one non empty volnitsky searcher
+                if (step != std::numeric_limits<size_t>::max())
+                {
+                    const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
+                    for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
+                    {
+                        for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
+                             cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
+                        {
+                            if (pos >= haystack + hash[cell_num].off - 1)
+                            {
+                                const auto * res = pos - (hash[cell_num].off - 1);
+                                const size_t ind = hash[cell_num].id;
+                                if (ans[from + ind] == 0 && res + needles[ind].size <= haystack_end)
+                                {
+                                    if (fallback_searchers[ind].compare(res))
+                                    {
+                                        ans[from + ind] = ansCallback(haystack, res);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+                prev_offset = haystack_offsets[j];
+            }
+        }
+    }
+
+    template <typename ResultType>
+    void search(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
+    {
+        auto callback = [this](const UInt8 * haystack, const UInt8 * haystack_end) -> bool
+        {
+            return this->searchOne(haystack, haystack_end);
+        };
+        searchInternal(haystack_data, haystack_offsets, callback, ans);
+    }
+
+    /// Runs `searchOneIndex` on every haystack row (through `searchInternal`) and stores the returned value of each row in `ans`.
+    /// `searchOneIndex` yields the one-based index of the matched needle with the smallest index, or zero when nothing matched.
+    template <typename ResultType>
+    void searchIndex(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets, ResultType & ans)
+    {
+        searchInternal(
+            haystack_data,
+            haystack_offsets,
+            [this](const UInt8 * row_begin, const UInt8 * row_end) -> size_t { return searchOneIndex(row_begin, row_end); },
+            ans);
+    }
+
+private:
+    /**
+     * Prepares the searcher for the next batch of needles.
+     * Returns `true` when every needle has already been consumed (`last == needles.size()`), i.e. there is nothing left to search for.
+     * Otherwise it clears the hash table, the list of fallback needles and `step`, then consumes needles starting from index `last`:
+     *  - needles for which `VolnitskyTraits::isFallbackNeedle` holds are only remembered by index in `fallback_needles`;
+     *  - all other needles get their n-grams put into the hash table, until the accumulated number of n-gram positions exceeds `small_limit`.
+     * A needle that does not fit anymore is left for the next call: the loop is left before `last` is advanced past it.
+     * A searcher is appended to `fallback_searchers` for every consumed needle, fallback or not, so it can be addressed by needle index.
+     * Returns `false` after a batch has been prepared.
+     * The intended usage is:
+     * while (!reset())
+     * {
+     *     search inside the haystack with the known needles
+     * }
+     */
+    bool reset()
+    {
+        if (last == needles.size())
+            return true;
+
+        memset(hash, 0, sizeof(hash));
+        fallback_needles.clear();
+        step = std::numeric_limits<size_t>::max(); /// sentinel: no needle has been put into the hash table yet
+
+        size_t buf = 0; /// number of n-gram positions accumulated for this batch
+        size_t size = needles.size();
+
+        for (; last < size; ++last)
+        {
+            const char * cur_needle_data = needles[last].data;
+            const size_t cur_needle_size = needles[last].size;
+
+            /// Fallback needles are not hashed, only their indices are saved.
+            if (VolnitskyTraits::isFallbackNeedle(cur_needle_size))
+            {
+                fallback_needles.push_back(last);
+            }
+            else
+            {
+                /// Puts one n-gram into the hash table; the needle index is read from `last` at the moment of the call.
+                auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset)
+                {
+                    return this->putNGramBase(ngram, offset, this->last);
+                };
+
+                buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
+
+                /// The batch is full: stop here without consuming this needle and let the caller search with the known needles.
+                if (buf > small_limit)
+                    break;
+
+                step = std::min(step, cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1);
+                for (auto i = static_cast<int>(cur_needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i) /// from the last n-gram to the first
+                {
+                    VolnitskyTraits::putNGram<CaseSensitive, ASCII>(
+                        reinterpret_cast<const UInt8 *>(cur_needle_data) + i,
+                        i + 1,
+                        reinterpret_cast<const UInt8 *>(cur_needle_data),
+                        callback);
+                }
+            }
+            fallback_searchers.emplace_back(cur_needle_data, cur_needle_size); /// reached for every consumed needle, so position == needle index
+        }
+        return false;
+    }
+
+    /**
+     * Applies `searchFallback` to every haystack row, once per batch of needles prepared by `reset()`.
+     *
+     * haystack_data / haystack_offsets - the column of haystacks; row j occupies the bytes
+     *   [offsets[j - 1], offsets[j] - 1), the last byte of each row is skipped (see the `- 1` below).
+     * searchFallback - callable (haystack_begin, haystack_end) -> value, searches with the needles of the current batch
+     *   and returns zero when nothing was found.
+     * ans - per-row result, indexed by row number.
+     *
+     * When the needles do not fit into one batch, the rows are scanned again for every further batch.
+     * A later batch must not overwrite a row that already has a non-zero result, otherwise a match from
+     * an earlier batch would be lost. Batches consume needles in increasing index order, so for the
+     * "index of the first matched needle" callback the first non-zero result is also the smallest index.
+     * The first batch always assigns, so `ans` does not have to be zero-initialized by the caller.
+     */
+    template <typename OneSearcher, typename ResultType>
+    inline void searchInternal(
+        const ColumnString::Chars & haystack_data,
+        const ColumnString::Offsets & haystack_offsets,
+        const OneSearcher & searchFallback,
+        ResultType & ans)
+    {
+        const size_t haystack_string_size = haystack_offsets.size();
+        bool first_batch = true;
+        while (!reset())
+        {
+            size_t prev_offset = 0;
+            for (size_t j = 0; j < haystack_string_size; ++j)
+            {
+                const auto * haystack = &haystack_data[prev_offset];
+                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
+                /// Keep the result of an earlier batch; only rows without a match are searched again.
+                if (first_batch || ans[j] == 0)
+                    ans[j] = searchFallback(haystack, haystack_end);
+                prev_offset = haystack_offsets[j];
+            }
+            first_batch = false;
+        }
+    }
+
+    /// Returns true as soon as one needle of the current batch is found inside [haystack, haystack_end).
+    inline bool searchOne(const UInt8 * haystack, const UInt8 * haystack_end) const
+    {
+        /// Needles that were not put into the hash table are checked one by one with their own searchers.
+        for (const size_t needle_index : fallback_needles)
+            if (fallback_searchers[needle_index].search(haystack, haystack_end) != haystack_end)
+                return true;
+
+        /// `step` keeps its sentinel value when the hash table holds no needle at all.
+        if (step == std::numeric_limits<size_t>::max())
+            return false;
+
+        constexpr size_t ngram_size = sizeof(VolnitskyTraits::Ngram);
+
+        for (const auto * pos = haystack + step - ngram_size; pos <= haystack_end - ngram_size; pos += step)
+        {
+            /// Walk the chain of occupied cells that starts at the hash of the n-gram under `pos`.
+            size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size;
+            while (hash[cell_num].off)
+            {
+                const auto & cell = hash[cell_num];
+                if (pos >= haystack + cell.off - 1)
+                {
+                    /// Where the needle would have to begin if this n-gram belongs to it.
+                    const auto candidate = pos - (cell.off - 1);
+                    const size_t ind = cell.id;
+                    if (candidate + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(candidate))
+                        return true;
+                }
+                cell_num = (cell_num + 1) % VolnitskyTraits::hash_size;
+            }
+        }
+        return false;
+    }
+
+    /// Returns the one-based index of the smallest-index needle of the current batch found inside [haystack, haystack_end),
+    /// or zero when none of them was found.
+    inline size_t searchOneIndex(const UInt8 * haystack, const UInt8 * haystack_end) const
+    {
+        /// Smallest matched needle index so far; the maximum value means "nothing matched".
+        size_t best = std::numeric_limits<size_t>::max();
+
+        /// Needles that were not put into the hash table are checked one by one with their own searchers.
+        for (const size_t needle_index : fallback_needles)
+            if (fallback_searchers[needle_index].search(haystack, haystack_end) != haystack_end)
+                best = std::min(best, needle_index);
+
+        /// `step` keeps its sentinel value when the hash table holds no needle at all.
+        if (step != std::numeric_limits<size_t>::max())
+        {
+            constexpr size_t ngram_size = sizeof(VolnitskyTraits::Ngram);
+
+            for (const auto * pos = haystack + step - ngram_size; pos <= haystack_end - ngram_size; pos += step)
+            {
+                /// Walk the chain of occupied cells that starts at the hash of the n-gram under `pos`.
+                size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size;
+                while (hash[cell_num].off)
+                {
+                    const auto & cell = hash[cell_num];
+                    if (pos >= haystack + cell.off - 1)
+                    {
+                        /// Where the needle would have to begin if this n-gram belongs to it.
+                        const auto candidate = pos - (cell.off - 1);
+                        const size_t ind = cell.id;
+                        if (candidate + needles[ind].size <= haystack_end && fallback_searchers[ind].compare(candidate))
+                            best = std::min(best, ind);
+                    }
+                    cell_num = (cell_num + 1) % VolnitskyTraits::hash_size;
+                }
+            }
+        }
+
+        /// If nothing was found, `best + 1` wraps around to zero; otherwise it is the index counted from one.
+        return best + 1;
+    }
+
+    /// Stores the pair (needle number `num`, `offset`) in the first free cell of the hash table,
+    /// probing linearly (with wrap-around) from the cell that `ngram` hashes to. A cell is free while its `off` is zero.
+    void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
+    {
+        size_t slot = ngram % VolnitskyTraits::hash_size;
+        for (; hash[slot].off; slot = (slot + 1) % VolnitskyTraits::hash_size)
+        {
+            /// skip occupied cells
+        }
+
+        const auto needle_id = static_cast<VolnitskyTraits::Id>(num);
+        const auto ngram_offset = static_cast<VolnitskyTraits::Offset>(offset);
+        hash[slot] = {needle_id, ngram_offset};
+    }
+};
+
+
+using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
+using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky
+using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
+using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
+
+using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>; /// NOTE(review): arguments are presumably <CaseSensitive, ASCII, fallback searcher> - confirm against the template declaration
+using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// same fallback searcher as MultiVolnitsky, second flag is false
+using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// first flag is false, ASCII case insensitive fallback searcher
+using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>; /// both flags are false, UTF-8 case insensitive fallback searcher


 }
--- a/dbms/src/Common/ZooKeeper/IKeeper.h
+++ b/dbms/src/Common/ZooKeeper/IKeeper.h
@ -61,6 +61,7 @@ struct Request
 {
    Request() = default;
    Request(const Request &) = default;
+    Request & operator=(const Request &) = default;
    virtual ~Request() = default;
    virtual String getPath() const = 0;
    virtual void addRootPath(const String & /* root_path */) {}
@ -76,6 +77,7 @@ struct Response
    int32_t error = 0;
    Response() = default;
    Response(const Response &) = default;
+    Response & operator=(const Response &) = default;
    virtual ~Response() = default;
    virtual void removeRootPath(const String & /* root_path */) {}
 };
--- a/dbms/src/Formats/CapnProtoRowInputStream.cpp
+++ b/dbms/src/Formats/CapnProtoRowInputStream.cpp
@ -23,6 +23,7 @@ namespace ErrorCodes
    extern const int BAD_TYPE_OF_FIELD;
    extern const int BAD_ARGUMENTS;
    extern const int THERE_IS_NO_COLUMN;
+    extern const int LOGICAL_ERROR;
 }

 static String getSchemaPath(const String & schema_dir, const String & schema_file)
@ -39,7 +40,7 @@ CapnProtoRowInputStream::NestedField split(const Block & header, size_t i)
    if (name.size() > 0 && name[0] == '.')
        name.erase(0, 1);

-    boost::split(field.tokens, name, boost::is_any_of("."));
+    boost::split(field.tokens, name, boost::is_any_of("._"));
    return field;
 }

@ -109,44 +110,62 @@ capnp::StructSchema::Field getFieldOrThrow(capnp::StructSchema node, const std::
        throw Exception("Field " + field + " doesn't exist in schema " + node.getShortDisplayName().cStr(), ErrorCodes::THERE_IS_NO_COLUMN);
 }

-void CapnProtoRowInputStream::createActions(const NestedFieldList & sortedFields, capnp::StructSchema reader)
-{
-    String last;
-    size_t level = 0;
-    capnp::StructSchema::Field parent;

-    for (const auto & field : sortedFields)
+void CapnProtoRowInputStream::createActions(const NestedFieldList & sorted_fields, capnp::StructSchema reader)
+{
+    /// Columns in a table can map to fields in Cap'n'Proto or to structs.
+
+    /// Store common parents and their tokens in order to backtrack.
+    std::vector<capnp::StructSchema::Field> parents;
+    std::vector<std::string> parent_tokens;
+
+    capnp::StructSchema cur_reader = reader;
+
+    for (const auto & field : sorted_fields)
    {
-        // Move to a different field in the same structure, keep parent
-        if (level > 0 && field.tokens[level - 1] != last)
+        if (field.tokens.empty())
+            throw Exception("Logical error in CapnProtoRowInputStream", ErrorCodes::LOGICAL_ERROR);
+
+        // Backtrack to common parent
+        while (field.tokens.size() < parent_tokens.size() + 1
+            || !std::equal(parent_tokens.begin(), parent_tokens.end(), field.tokens.begin()))
        {
-            auto child = getFieldOrThrow(parent.getContainingStruct(), field.tokens[level - 1]);
-            reader = child.getType().asStruct();
            actions.push_back({Action::POP});
-            actions.push_back({Action::PUSH, child});
-        }
-        // Descend to a nested structure
-        for (; level < field.tokens.size() - 1; ++level)
+            parents.pop_back();
+            parent_tokens.pop_back();
+
+            if (parents.empty())
            {
-            auto node = getFieldOrThrow(reader, field.tokens[level]);
+                cur_reader = reader;
+                break;
+            }
+            else
+                cur_reader = parents.back().getType().asStruct();
+        }
+
+        // Go forward
+        while (parent_tokens.size() + 1 < field.tokens.size())
+        {
+            const auto & token = field.tokens[parents.size()];
+            auto node = getFieldOrThrow(cur_reader, token);
            if (node.getType().isStruct())
            {
                // Descend to field structure
-                last = field.tokens[level];
-                parent = node;
-                reader = parent.getType().asStruct();
-                actions.push_back({Action::PUSH, parent});
+                parents.emplace_back(node);
+                parent_tokens.emplace_back(token);
+                cur_reader = node.getType().asStruct();
+                actions.push_back({Action::PUSH, node});
            }
            else if (node.getType().isList())
            {
                break; // Collect list
            }
            else
-                throw Exception("Field " + field.tokens[level] + "is neither Struct nor List", ErrorCodes::BAD_TYPE_OF_FIELD);
+                throw Exception("Field " + token + " is neither Struct nor List", ErrorCodes::BAD_TYPE_OF_FIELD);
        }

        // Read field from the structure
-        auto node = getFieldOrThrow(reader, field.tokens[level]);
+        auto node = getFieldOrThrow(cur_reader, field.tokens[parents.size()]);
        if (node.getType().isList() && actions.size() > 0 && actions.back().field == node)
        {
            // The field list here flattens Nested elements into multiple arrays
@ -168,7 +187,6 @@ void CapnProtoRowInputStream::createActions(const NestedFieldList & sortedFields
 CapnProtoRowInputStream::CapnProtoRowInputStream(ReadBuffer & istr_, const Block & header_, const String & schema_dir, const String & schema_file, const String & root_object)
    : istr(istr_), header(header_), parser(std::make_shared<SchemaParser>())
 {
-
    // Parse the schema and fetch the root object

 #pragma GCC diagnostic push
@ -188,14 +206,8 @@ CapnProtoRowInputStream::CapnProtoRowInputStream(ReadBuffer & istr_, const Block
    for (size_t i = 0; i < num_columns; ++i)
        list.push_back(split(header, i));

-    // Reorder list to make sure we don't have to backtrack
-    std::sort(list.begin(), list.end(), [](const NestedField & a, const NestedField & b)
-    {
-        if (a.tokens.size() == b.tokens.size())
-            return a.tokens < b.tokens;
-           return a.tokens.size() < b.tokens.size();
-    });
-
+    // Order list first by value of strings then by length of string vector.
+    std::sort(list.begin(), list.end(), [](const NestedField & a, const NestedField & b) { return a.tokens < b.tokens; });
    createActions(list, root);
 }

--- a/dbms/src/Functions/FunctionsStringSearch.cpp
+++ b/dbms/src/Functions/FunctionsStringSearch.cpp
@ -1,28 +1,28 @@
 #include <Functions/FunctionsStringSearch.h>

-#include <memory>
-#include <mutex>
-#include <Poco/UTF8String.h>
 #include <Columns/ColumnFixedString.h>
-#include <Common/Volnitsky.h>
+#include <Common/config.h>
+
 #include <DataTypes/DataTypeFixedString.h>
 #include <Functions/FunctionFactory.h>
 #include <Functions/Regexps.h>
 #include <IO/WriteHelpers.h>
-#include <Common/config.h>
 #include <re2/re2.h>
 #include <re2/stringpiece.h>
+#include <Poco/UTF8String.h>
+#include <Common/Volnitsky.h>
+
+#include <algorithm>
+#include <memory>

 #if USE_RE2_ST
-    #include <re2_st/re2.h> // Y_IGNORE
+#    include <re2_st/re2.h> // Y_IGNORE
 #else
-    #define re2_st re2
+#    define re2_st re2
 #endif

-
 namespace DB
 {
-
 namespace ErrorCodes
 {
    extern const int BAD_ARGUMENTS;
@ -35,7 +35,10 @@ namespace ErrorCodes
 struct PositionCaseSensitiveASCII
 {
    /// For searching single substring inside big-enough contiguous chunk of data. Coluld have slightly expensive initialization.
-    using SearcherInBigHaystack = VolnitskyImpl<true, true>;
+    using SearcherInBigHaystack = Volnitsky;
+
+    /// For searching many substrings in one string.
+    using MultiSearcherInBigHaystack = MultiVolnitsky;

    /// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
    using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
@ -50,23 +53,24 @@ struct PositionCaseSensitiveASCII
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

-    /// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
-    static size_t countChars(const char * begin, const char * end)
+    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
    {
-        return end - begin;
+        return MultiSearcherInBigHaystack(needles);
    }

+    /// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
+    static size_t countChars(const char * begin, const char * end) { return end - begin; }
+
    /// Convert string to lowercase. Only for case-insensitive search.
    /// Implementation is permitted to be inefficient because it is called for single string.
-    static void toLowerIfNeed(std::string &)
-    {
-    }
+    static void toLowerIfNeed(std::string &) {}
 };

 struct PositionCaseInsensitiveASCII
 {
    /// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
    using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
+    using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
    using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;

    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
@ -79,20 +83,20 @@ struct PositionCaseInsensitiveASCII
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

-    static size_t countChars(const char * begin, const char * end)
+    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
    {
-        return end - begin;
+        return MultiSearcherInBigHaystack(needles);
    }

-    static void toLowerIfNeed(std::string & s)
-    {
-        std::transform(std::begin(s), std::end(s), std::begin(s), tolower);
-    }
+    static size_t countChars(const char * begin, const char * end) { return end - begin; }
+
+    static void toLowerIfNeed(std::string & s) { std::transform(std::begin(s), std::end(s), std::begin(s), tolower); }
 };

 struct PositionCaseSensitiveUTF8
 {
-    using SearcherInBigHaystack = VolnitskyImpl<true, false>;
+    using SearcherInBigHaystack = VolnitskyUTF8;
+    using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
    using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;

    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
@ -105,6 +109,11 @@ struct PositionCaseSensitiveUTF8
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

+    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
+    {
+        return MultiSearcherInBigHaystack(needles);
+    }
+
    static size_t countChars(const char * begin, const char * end)
    {
        size_t res = 0;
@ -114,14 +123,13 @@ struct PositionCaseSensitiveUTF8
        return res;
    }

-    static void toLowerIfNeed(std::string &)
-    {
-    }
+    static void toLowerIfNeed(std::string &) {}
 };

 struct PositionCaseInsensitiveUTF8
 {
-    using SearcherInBigHaystack = VolnitskyImpl<false, false>;
+    using SearcherInBigHaystack = VolnitskyCaseInsensitiveUTF8;
+    using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitiveUTF8;
    using SearcherInSmallHaystack = UTF8CaseInsensitiveStringSearcher; /// TODO Very suboptimal.

    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
@ -134,6 +142,11 @@ struct PositionCaseInsensitiveUTF8
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

+    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
+    {
+        return MultiSearcherInBigHaystack(needles);
+    }
+
    static size_t countChars(const char * begin, const char * end)
    {
        size_t res = 0;
@ -143,10 +156,7 @@ struct PositionCaseInsensitiveUTF8
        return res;
    }

-    static void toLowerIfNeed(std::string & s)
-    {
-        Poco::UTF8::toLowerInPlace(s);
-    }
+    static void toLowerIfNeed(std::string & s) { Poco::UTF8::toLowerInPlace(s); }
 };

 template <typename Impl>
@ -155,10 +165,8 @@ struct PositionImpl
    using ResultType = UInt64;

    /// Find one substring in many strings.
-    static void vector_constant(const ColumnString::Chars & data,
-        const ColumnString::Offsets & offsets,
-        const std::string & needle,
-        PaddedPODArray<UInt64> & res)
+    static void vector_constant(
+        const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<UInt64> & res)
    {
        const UInt8 * begin = data.data();
        const UInt8 * pos = begin;
@ -210,7 +218,8 @@ struct PositionImpl
    }

    /// Search each time for a different single substring inside each time different string.
-    static void vector_vector(const ColumnString::Chars & haystack_data,
+    static void vector_vector(
+        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const ColumnString::Chars & needle_data,
        const ColumnString::Offsets & needle_offsets,
@ -234,8 +243,8 @@ struct PositionImpl
            else
            {
                /// It is assumed that the StringSearcher is not very difficult to initialize.
-                typename Impl::SearcherInSmallHaystack searcher
-                    = Impl::createSearcherInSmallHaystack(reinterpret_cast<const char *>(&needle_data[prev_needle_offset]),
+                typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
+                    reinterpret_cast<const char *>(&needle_data[prev_needle_offset]),
                    needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end

                /// searcher returns a pointer to the found substring or to the end of `haystack`.
@ -244,7 +253,9 @@ struct PositionImpl

                if (pos != haystack_size)
                {
-                    res[i] = 1 + Impl::countChars(reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
+                    res[i] = 1
+                        + Impl::countChars(
+                                 reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
                                 reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos]));
                }
                else
@ -256,8 +267,9 @@ struct PositionImpl
        }
    }

-    /// Find many substrings in one line.
-    static void constant_vector(const String & haystack,
+    /// Find many substrings in single string.
+    static void constant_vector(
+        const String & haystack,
        const ColumnString::Chars & needle_data,
        const ColumnString::Offsets & needle_offsets,
        PaddedPODArray<UInt64> & res)
@ -281,7 +293,8 @@ struct PositionImpl
                typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
                    reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1);

-                size_t pos = searcher.search(reinterpret_cast<const UInt8 *>(haystack.data()),
+                size_t pos = searcher.search(
+                                 reinterpret_cast<const UInt8 *>(haystack.data()),
                                 reinterpret_cast<const UInt8 *>(haystack.data()) + haystack.size())
                    - reinterpret_cast<const UInt8 *>(haystack.data());

@ -298,6 +311,56 @@ struct PositionImpl
    }
 };

+/// Exposes `searchAll` of the multi-needle searcher provided by `Impl` through the `vector_constant` interface.
+template <typename Impl>
+struct MultiPositionImpl
+{
+    using ResultType = UInt64;
+
+    /// Builds a multi-searcher for `needles` and lets its `searchAll` fill `res` for the haystack column.
+    /// Each reported value is computed by the callback below as one plus `Impl::countChars` over the given range.
+    static void vector_constant(
+        const ColumnString::Chars & haystack_data,
+        const ColumnString::Offsets & haystack_offsets,
+        const std::vector<StringRef> & needles,
+        PaddedPODArray<UInt64> & res)
+    {
+        auto position_of = [](const UInt8 * start, const UInt8 * end) -> UInt64
+        {
+            const auto * range_begin = reinterpret_cast<const char *>(start);
+            const auto * range_end = reinterpret_cast<const char *>(end);
+            return 1 + Impl::countChars(range_begin, range_end);
+        };
+
+        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
+        searcher.searchAll(haystack_data, haystack_offsets, position_of, res);
+    }
+};
+
+/// Exposes `search` of the multi-needle searcher provided by `Impl` through the `vector_constant` interface.
+template <typename Impl>
+struct MultiSearchImpl
+{
+    using ResultType = UInt64;
+
+    /// Builds a multi-searcher for `needles` and lets its `search` fill `res` for the haystack column.
+    static void vector_constant(
+        const ColumnString::Chars & haystack_data,
+        const ColumnString::Offsets & haystack_offsets,
+        const std::vector<StringRef> & needles,
+        PaddedPODArray<UInt64> & res)
+    {
+        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
+        searcher.search(haystack_data, haystack_offsets, res);
+    }
+};
+
+/// Exposes `searchIndex` of the multi-needle searcher provided by `Impl` through the `vector_constant` interface.
+template <typename Impl>
+struct FirstMatchImpl
+{
+    using ResultType = UInt64;
+
+    /// Builds a multi-searcher for `needles` and lets its `searchIndex` fill `res` for the haystack column.
+    static void vector_constant(
+        const ColumnString::Chars & haystack_data,
+        const ColumnString::Offsets & haystack_offsets,
+        const std::vector<StringRef> & needles,
+        PaddedPODArray<UInt64> & res)
+    {
+        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
+        searcher.searchIndex(haystack_data, haystack_offsets, res);
+    }
+};
+

 /// Is the LIKE expression reduced to finding a substring in a string?
 inline bool likePatternIsStrstr(const String & pattern, String & res)
@ -348,10 +411,8 @@ struct MatchImpl
 {
    using ResultType = UInt8;

-    static void vector_constant(const ColumnString::Chars & data,
-        const ColumnString::Offsets & offsets,
-        const std::string & pattern,
-        PaddedPODArray<UInt8> & res)
+    static void vector_constant(
+        const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
    {
        if (offsets.empty())
            return;
@ -467,13 +528,14 @@ struct MatchImpl
                            size_t str_size = (i != 0 ? offsets[i] - offsets[i - 1] : offsets[0]) - 1;

                            /** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
-                              *  so that it can match when `required_substring` occurs into the line several times,
+                              *  so that it can match when `required_substring` occurs into the string several times,
                              *  and at the first occurrence, the regexp is not a match.
                              */

                            if (required_substring_is_prefix)
                                res[i] = revert
-                                    ^ regexp->getRE2()->Match(re2_st::StringPiece(str_data, str_size),
+                                    ^ regexp->getRE2()->Match(
+                                          re2_st::StringPiece(str_data, str_size),
                                          reinterpret_cast<const char *>(pos) - str_data,
                                          str_size,
                                          re2_st::RE2::UNANCHORED,
@ -504,13 +566,15 @@ struct MatchImpl
        res = revert ^ regexp->match(data);
    }

-    template <typename... Args> static void vector_vector(Args &&...)
+    template <typename... Args>
+    static void vector_vector(Args &&...)
    {
        throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
    }

    /// Search different needles in single haystack.
-    template <typename... Args> static void constant_vector(Args &&...)
+    template <typename... Args>
+    static void constant_vector(Args &&...)
    {
        throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
    }
@ -519,7 +583,8 @@ struct MatchImpl

 struct ExtractImpl
 {
-    static void vector(const ColumnString::Chars & data,
+    static void vector(
+        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        const std::string & pattern,
        ColumnString::Chars & res_data,
@ -613,16 +678,17 @@ struct ReplaceRegexpImpl

        for (const auto & it : instructions)
            if (it.first >= num_captures)
-                throw Exception("Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
-                        + toString(num_captures - 1)
-                        + " subpatterns",
+                throw Exception(
+                    "Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+                        + toString(num_captures - 1) + " subpatterns",
                    ErrorCodes::BAD_ARGUMENTS);

        return instructions;
    }


-    static void processString(const re2_st::StringPiece & input,
+    static void processString(
+        const re2_st::StringPiece & input,
        ColumnString::Chars & res_data,
        ColumnString::Offset & res_offset,
        re2_st::RE2 & searcher,
@ -687,7 +753,8 @@ struct ReplaceRegexpImpl
    }


-    static void vector(const ColumnString::Chars & data,
+    static void vector(
+        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        const std::string & needle,
        const std::string & replacement,
@ -715,7 +782,8 @@ struct ReplaceRegexpImpl
        }
    }

-    static void vector_fixed(const ColumnString::Chars & data,
+    static void vector_fixed(
+        const ColumnString::Chars & data,
        size_t n,
        const std::string & needle,
        const std::string & replacement,
@ -749,7 +817,8 @@ struct ReplaceRegexpImpl
 template <bool replace_one = false>
 struct ReplaceStringImpl
 {
-    static void vector(const ColumnString::Chars & data,
+    static void vector(
+        const ColumnString::Chars & data,
        const ColumnString::Offsets & offsets,
        const std::string & needle,
        const std::string & replacement,
@ -791,7 +860,7 @@ struct ReplaceStringImpl
            if (i == offsets.size())
                break;

-            /// Is it true that this line no longer needs to perform transformations.
+            /// Is it true that this string no longer needs to perform transformations.
            bool can_finish_current_string = false;

            /// We check that the entry does not go through the boundaries of strings.
@ -824,7 +893,8 @@ struct ReplaceStringImpl

    /// Note: this function converts fixed-length strings to variable-length strings
    ///       and each variable-length string should ends with zero byte.
-    static void vector_fixed(const ColumnString::Chars & data,
+    static void vector_fixed(
+        const ColumnString::Chars & data,
        size_t n,
        const std::string & needle,
        const std::string & replacement,
@ -851,7 +921,8 @@ struct ReplaceStringImpl
            const UInt8 * match = searcher.search(pos, end - pos);

 #define COPY_REST_OF_CURRENT_STRING() \
-    do { \
+    do \
+    { \
        const size_t len = begin + n * (i + 1) - pos; \
        res_data.resize(res_data.size() + len + 1); \
        memcpy(&res_data[res_offset], pos, len); \
@ -878,7 +949,7 @@ struct ReplaceStringImpl
            memcpy(&res_data[res_offset], pos, match - pos);
            res_offset += (match - pos);

-            /// Is it true that this line no longer needs to perform conversions.
+            /// Is it true that this string no longer needs to perform conversions.
            bool can_finish_current_string = false;

            /// We check that the entry does not pass through the boundaries of strings.
@ -935,20 +1006,11 @@ class FunctionStringReplace : public IFunction
 {
 public:
    static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &)
-    {
-        return std::make_shared<FunctionStringReplace>();
-    }
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); }

-    String getName() const override
-    {
-        return name;
-    }
+    String getName() const override { return name; }

-    size_t getNumberOfArguments() const override
-    {
-        return 3;
-    }
+    size_t getNumberOfArguments() const override { return 3; }

    bool useDefaultImplementationForConstants() const override { return true; }
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
@ -956,15 +1018,18 @@ public:
    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
        if (!isStringOrFixedString(arguments[0]))
-            throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
+            throw Exception(
+                "Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

        if (!isStringOrFixedString(arguments[1]))
-            throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
+            throw Exception(
+                "Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

        if (!isStringOrFixedString(arguments[2]))
-            throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
+            throw Exception(
+                "Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

        return std::make_shared<DataTypeString>();
@ -1025,6 +1090,54 @@ struct NamePositionCaseInsensitiveUTF8
 {
    static constexpr auto name = "positionCaseInsensitiveUTF8";
 };
+/// Name tags for the multi-needle search functions. Each struct only carries the SQL-visible
+/// function name; it is passed as the `Name` template argument of the function classes below.
+struct NameMultiPosition
+{
+    static constexpr auto name = "multiPosition";
+};
+struct NameMultiPositionUTF8
+{
+    static constexpr auto name = "multiPositionUTF8";
+};
+struct NameMultiPositionCaseInsensitive
+{
+    static constexpr auto name = "multiPositionCaseInsensitive";
+};
+struct NameMultiPositionCaseInsensitiveUTF8
+{
+    static constexpr auto name = "multiPositionCaseInsensitiveUTF8";
+};
+struct NameMultiSearch
+{
+    static constexpr auto name = "multiSearch";
+};
+struct NameMultiSearchUTF8
+{
+    static constexpr auto name = "multiSearchUTF8";
+};
+struct NameMultiSearchCaseInsensitive
+{
+    static constexpr auto name = "multiSearchCaseInsensitive";
+};
+struct NameMultiSearchCaseInsensitiveUTF8
+{
+    static constexpr auto name = "multiSearchCaseInsensitiveUTF8";
+};
+struct NameFirstMatch
+{
+    static constexpr auto name = "firstMatch";
+};
+struct NameFirstMatchUTF8
+{
+    static constexpr auto name = "firstMatchUTF8";
+};
+struct NameFirstMatchCaseInsensitive
+{
+    static constexpr auto name = "firstMatchCaseInsensitive";
+};
+struct NameFirstMatchCaseInsensitiveUTF8
+{
+    static constexpr auto name = "firstMatchCaseInsensitiveUTF8";
+};
 struct NameMatch
 {
    static constexpr auto name = "match";
@ -1064,6 +1177,27 @@ using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<Posit
 using FunctionPositionCaseInsensitiveUTF8
    = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;

+/// multiPosition*: positions of the first occurrence of every constant needle in the haystack.
+/// The four variants differ only in the Position* policy (case sensitivity, ASCII vs UTF-8).
+using FunctionMultiPosition = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveASCII>, NameMultiPosition>;
+using FunctionMultiPositionUTF8 = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveUTF8>, NameMultiPositionUTF8>;
+using FunctionMultiPositionCaseInsensitive
+    = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveASCII>, NameMultiPositionCaseInsensitive>;
+using FunctionMultiPositionCaseInsensitiveUTF8
+    = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiPositionCaseInsensitiveUTF8>;
+
+/// multiSearch*: 0 or 1 depending on whether any of the constant needles occurs in the haystack.
+using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearch>;
+using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchUTF8>;
+using FunctionMultiSearchCaseInsensitive
+    = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchCaseInsensitive>;
+using FunctionMultiSearchCaseInsensitiveUTF8
+    = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchCaseInsensitiveUTF8>;
+
+/// firstMatch*: the first index of the matched needle, or zero if nothing was found.
+using FunctionFirstMatch = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveASCII>, NameFirstMatch>;
+using FunctionFirstMatchUTF8 = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveUTF8>, NameFirstMatchUTF8>;
+using FunctionFirstMatchCaseInsensitive
+    = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveASCII>, NameFirstMatchCaseInsensitive>;
+using FunctionFirstMatchCaseInsensitiveUTF8
+    = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveUTF8>, NameFirstMatchCaseInsensitiveUTF8>;
+
 using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
 using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
 using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
@ -1080,14 +1214,32 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
    factory.registerFunction<FunctionReplaceAll>();
    factory.registerFunction<FunctionReplaceRegexpOne>();
    factory.registerFunction<FunctionReplaceRegexpAll>();
+
    factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive);
    factory.registerFunction<FunctionPositionUTF8>();
    factory.registerFunction<FunctionPositionCaseInsensitive>();
    factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();
+
+    factory.registerFunction<FunctionMultiPosition>();
+    factory.registerFunction<FunctionMultiPositionUTF8>();
+    factory.registerFunction<FunctionMultiPositionCaseInsensitive>();
+    factory.registerFunction<FunctionMultiPositionCaseInsensitiveUTF8>();
+
+    factory.registerFunction<FunctionMultiSearch>();
+    factory.registerFunction<FunctionMultiSearchUTF8>();
+    factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
+    factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();
+
+    factory.registerFunction<FunctionFirstMatch>();
+    factory.registerFunction<FunctionFirstMatchUTF8>();
+    factory.registerFunction<FunctionFirstMatchCaseInsensitive>();
+    factory.registerFunction<FunctionFirstMatchCaseInsensitiveUTF8>();
+
    factory.registerFunction<FunctionMatch>();
    factory.registerFunction<FunctionLike>();
    factory.registerFunction<FunctionNotLike>();
    factory.registerFunction<FunctionExtract>();
+
    factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
    factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
 }
--- a/dbms/src/Functions/FunctionsStringSearch.h
+++ b/dbms/src/Functions/FunctionsStringSearch.h
@ -1,17 +1,20 @@
 #pragma once

+#include <Columns/ColumnArray.h>
 #include <Columns/ColumnConst.h>
 #include <Columns/ColumnString.h>
 #include <Columns/ColumnVector.h>
+#include <Core/Field.h>
+#include <DataTypes/DataTypeArray.h>
 #include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypesNumber.h>
-#include <Functions/IFunction.h>
 #include <Functions/FunctionHelpers.h>
-
+#include <Functions/IFunction.h>
+#include <IO/WriteHelpers.h>
+#include <common/StringRef.h>

 namespace DB
 {
-
 /** Search and replace functions in strings:
  *
  * position(haystack, needle)     - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found.
@ -35,12 +38,28 @@ namespace DB
  *
  * replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence.
  * replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified type, all occurrences.
+  *
+  * multiPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack
+  * multiPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  * multiPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  * multiPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  *
+  * multiSearch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
+  * multiSearchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  * multiSearchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  * multiSearchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  *
+  * firstMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found
+  * firstMatchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  * firstMatchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
+  * firstMatchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
  */

 namespace ErrorCodes
 {
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int ILLEGAL_COLUMN;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
 }

 template <typename Impl, typename Name>
@ -48,20 +67,11 @@ class FunctionsStringSearch : public IFunction
 {
 public:
    static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &)
-    {
-        return std::make_shared<FunctionsStringSearch>();
-    }
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearch>(); }

-    String getName() const override
-    {
-        return name;
-    }
+    String getName() const override { return name; }

-    size_t getNumberOfArguments() const override
-    {
-        return 2;
-    }
+    size_t getNumberOfArguments() const override { return 2; }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
    {
@ -90,7 +100,8 @@ public:
        {
            ResultType res{};
            Impl::constant_constant(col_haystack_const->getValue<String>(), col_needle_const->getValue<String>(), res);
-            block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
+            block.getByPosition(result).column
+                = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
            return;
        }

@ -103,20 +114,22 @@ public:
        const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle);

        if (col_haystack_vector && col_needle_vector)
-            Impl::vector_vector(col_haystack_vector->getChars(),
+            Impl::vector_vector(
+                col_haystack_vector->getChars(),
                col_haystack_vector->getOffsets(),
                col_needle_vector->getChars(),
                col_needle_vector->getOffsets(),
                vec_res);
        else if (col_haystack_vector && col_needle_const)
-            Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res);
+            Impl::vector_constant(
+                col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res);
        else if (col_haystack_const && col_needle_vector)
-            Impl::constant_vector(col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res);
+            Impl::constant_vector(
+                col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res);
        else
-            throw Exception("Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and "
-                    + block.getByPosition(arguments[1]).column->getName()
-                    + " of arguments of function "
-                    + getName(),
+            throw Exception(
+                "Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and "
+                    + block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(),
                ErrorCodes::ILLEGAL_COLUMN);

        block.getByPosition(result).column = std::move(col_res);
@ -129,20 +142,11 @@ class FunctionsStringSearchToString : public IFunction
 {
 public:
    static constexpr auto name = Name::name;
-    static FunctionPtr create(const Context &)
-    {
-        return std::make_shared<FunctionsStringSearchToString>();
-    }
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearchToString>(); }

-    String getName() const override
-    {
-        return name;
-    }
+    String getName() const override { return name; }

-    size_t getNumberOfArguments() const override
-    {
-        return 2;
-    }
+    size_t getNumberOfArguments() const override { return 2; }

    bool useDefaultImplementationForConstants() const override { return true; }
    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
@ -186,4 +190,156 @@ public:
    }
 };

+/** Function of the form f(haystack, [needle_1, ..., needle_n]) returning an array per haystack row.
+  * The second argument must be a constant array of strings; every row of the result holds
+  * exactly one Impl-computed value per needle.
+  *
+  * Impl must provide:
+  *   ResultType - element type of the nested result column;
+  *   vector_constant(haystack_chars, haystack_offsets, needles, result) - fills `result`,
+  *     which is pre-sized to rows * number_of_needles.
+  */
+template <typename Impl, typename Name>
+class FunctionsMultiStringPosition : public IFunction
+{
+public:
+    static constexpr auto name = Name::name;
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringPosition>(); }
+
+    String getName() const override { return name; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    /// The array of needles is always constant, so a constant haystack makes the whole call constant.
+    /// Delegate that case to the default implementation instead of rejecting it as an illegal column.
+    bool useDefaultImplementationForConstants() const override { return true; }
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
+
+    /// Accepts (String, Array(String)); the limit on the number of needles is validated in executeImpl,
+    /// because the array size is not part of the argument type.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception(
+                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
+        if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
+            throw Exception(
+                "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
+    }
+
+    /// Throws ILLEGAL_COLUMN if the needles are not a constant array or the haystack is not a ColumnString,
+    /// and NUMBER_OF_ARGUMENTS_DOESNT_MATCH if more than 255 needles are passed.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
+    {
+        using ResultType = typename Impl::ResultType;
+
+        const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
+
+        const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
+
+        const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column;
+        const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
+
+        if (!col_const_arr)
+            throw Exception(
+                "Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const",
+                ErrorCodes::ILLEGAL_COLUMN);
+
+        Array src_arr = col_const_arr->getValue<Array>();
+
+        /// The documented limit is on the number of needles, not on the (fixed) number of function arguments.
+        if (src_arr.size() > std::numeric_limits<UInt8>::max())
+            throw Exception(
+                "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size())
+                    + ", should be at most 255.",
+                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+
+        /// The references point into src_arr, which stays alive until the end of this function.
+        std::vector<StringRef> refs;
+        refs.reserve(src_arr.size());
+
+        for (const auto & el : src_arr)
+            refs.emplace_back(el.get<String>());
+
+        const size_t column_haystack_size = column_haystack->size();
+
+        auto col_res = ColumnVector<ResultType>::create();
+        auto col_offsets = ColumnArray::ColumnOffsets::create(column_haystack_size);
+
+        auto & vec_res = col_res->getData();
+        auto & offsets_res = col_offsets->getData();
+
+        vec_res.resize(column_haystack_size * refs.size());
+
+        if (col_haystack_vector)
+            Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
+        else
+            throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
+
+        /// Every row has exactly refs.size() elements, so the i-th offset is (i + 1) * refs.size().
+        size_t refs_size = refs.size();
+        size_t accum = refs_size;
+
+        for (size_t i = 0; i < column_haystack_size; ++i, accum += refs_size)
+            offsets_res[i] = accum;
+
+        block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets));
+    }
+};
+
+/** Function of the form f(haystack, [needle_1, ..., needle_n]) returning one number per haystack row.
+  * The second argument must be a constant array of strings.
+  *
+  * Impl must provide:
+  *   ResultType - numeric type of the result column;
+  *   vector_constant(haystack_chars, haystack_offsets, needles, result) - fills `result`,
+  *     which is pre-sized to the number of rows.
+  */
+template <typename Impl, typename Name>
+class FunctionsMultiStringSearch : public IFunction
+{
+public:
+    static constexpr auto name = Name::name;
+    static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringSearch>(); }
+
+    String getName() const override { return name; }
+
+    size_t getNumberOfArguments() const override { return 2; }
+
+    /// The array of needles is always constant, so a constant haystack makes the whole call constant.
+    /// Delegate that case to the default implementation instead of rejecting it as an illegal column.
+    bool useDefaultImplementationForConstants() const override { return true; }
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
+
+    /// Accepts (String, Array(String)); the limit on the number of needles is validated in executeImpl,
+    /// because the array size is not part of the argument type.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        if (!isString(arguments[0]))
+            throw Exception(
+                "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
+        if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
+            throw Exception(
+                "Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
+    }
+
+    /// Throws ILLEGAL_COLUMN if the needles are not a constant array or the haystack is not a ColumnString,
+    /// and NUMBER_OF_ARGUMENTS_DOESNT_MATCH if more than 255 needles are passed.
+    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
+    {
+        using ResultType = typename Impl::ResultType;
+
+        const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
+
+        const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
+
+        const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column;
+        const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
+
+        if (!col_const_arr)
+            throw Exception(
+                "Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const",
+                ErrorCodes::ILLEGAL_COLUMN);
+
+        Array src_arr = col_const_arr->getValue<Array>();
+
+        /// The documented limit is on the number of needles, not on the (fixed) number of function arguments.
+        if (src_arr.size() > std::numeric_limits<UInt8>::max())
+            throw Exception(
+                "Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size())
+                    + ", should be at most 255.",
+                ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+
+        /// The references point into src_arr, which stays alive until the end of this function.
+        std::vector<StringRef> refs;
+        refs.reserve(src_arr.size());
+
+        for (const auto & el : src_arr)
+            refs.emplace_back(el.get<String>());
+
+        const size_t column_haystack_size = column_haystack->size();
+
+        auto col_res = ColumnVector<ResultType>::create();
+
+        auto & vec_res = col_res->getData();
+
+        vec_res.resize(column_haystack_size);
+
+        if (col_haystack_vector)
+            Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
+        else
+            throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
+
+        block.getByPosition(result).column = std::move(col_res);
+    }
+};
+
 }
--- a/dbms/src/Interpreters/ActionsVisitor.cpp
+++ b/dbms/src/Interpreters/ActionsVisitor.cpp
@ -446,13 +446,11 @@ void ActionsVisitor::visit(const ASTPtr & ast)

                    for (size_t j = 0; j < lambda_arg_asts.size(); ++j)
                    {
-                        ASTIdentifier * lambda_identifier = typeid_cast<ASTIdentifier *>(lambda_arg_asts[j].get());
-                        if (!lambda_identifier)
+                        auto opt_arg_name = getIdentifierName(lambda_arg_asts[j]);
+                        if (!opt_arg_name)
                            throw Exception("lambda argument declarations must be identifiers", ErrorCodes::TYPE_MISMATCH);

-                        String arg_name = lambda_identifier->name;
-
-                        lambda_arguments.emplace_back(arg_name, lambda_type->getArgumentTypes()[j]);
+                        lambda_arguments.emplace_back(*opt_arg_name, lambda_type->getArgumentTypes()[j]);
                    }

                    actions_stack.pushLevel(lambda_arguments);
@ -541,9 +539,6 @@ void ActionsVisitor::makeSet(const ASTFunction * node, const Block & sample_bloc
    const ASTIdentifier * identifier = typeid_cast<const ASTIdentifier *>(arg.get());
    if (typeid_cast<const ASTSubquery *>(arg.get()) || identifier)
    {
-        /// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery.
-        String set_id = arg->getColumnName();
-
        /// A special case is if the name of the table is specified on the right side of the IN statement,
        ///  and the table has the type Set (a previously prepared set).
        if (identifier)
@ -563,6 +558,9 @@ void ActionsVisitor::makeSet(const ASTFunction * node, const Block & sample_bloc
            }
        }

+        /// We get the stream of blocks for the subquery. Create Set and put it in place of the subquery.
+        String set_id = arg->getColumnName();
+
        SubqueryForSet & subquery_for_set = subqueries_for_sets[set_id];

        /// If you already created a Set with the same subquery / table.
--- a/dbms/src/Interpreters/AddDefaultDatabaseVisitor.h
+++ b/dbms/src/Interpreters/AddDefaultDatabaseVisitor.h
@ -10,6 +10,7 @@
 #include <Parsers/ASTTablesInSelectQuery.h>
 #include <Parsers/ASTFunction.h>
 #include <Parsers/DumpASTNode.h>
+#include <Interpreters/DatabaseAndTableWithAlias.h>

 namespace DB
 {
@ -90,19 +91,15 @@ private:
    void visit(ASTTableExpression & table_expression, ASTPtr &) const
    {
        if (table_expression.database_and_table_name)
-        {
            tryVisit<ASTIdentifier>(table_expression.database_and_table_name);
-
-            if (table_expression.database_and_table_name->children.size() != 2)
-                throw Exception("Logical error: more than two components in table expression", ErrorCodes::LOGICAL_ERROR);
-        }
        else if (table_expression.subquery)
            tryVisit<ASTSubquery>(table_expression.subquery);
    }

+    /// @note It expects that only table (not column) identifiers are visited.
    void visit(const ASTIdentifier & identifier, ASTPtr & ast) const
    {
-        if (ast->children.empty())
+        if (identifier.name_parts.empty())
            ast = createTableIdentifier(database_name, identifier.name);
    }

--- a/dbms/src/Interpreters/Cluster.cpp
+++ b/dbms/src/Interpreters/Cluster.cpp
@ -67,12 +67,13 @@ Cluster::Address::Address(const Poco::Util::AbstractConfiguration & config, cons
 }


-Cluster::Address::Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port)
+Cluster::Address::Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port, bool secure_)
    : user(user_), password(password_)
 {
    auto parsed_host_port = parseAddress(host_port_, clickhouse_port);
    host_name = parsed_host_port.first;
    port = parsed_host_port.second;
+    secure = secure_ ? Protocol::Secure::Enable : Protocol::Secure::Disable;

    initially_resolved_address = DNSResolver::instance().resolveAddress(parsed_host_port.first, parsed_host_port.second);
    is_local = isLocal(*this, initially_resolved_address, clickhouse_port);
@ -319,7 +320,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting


 Cluster::Cluster(const Settings & settings, const std::vector<std::vector<String>> & names,
-                 const String & username, const String & password, UInt16 clickhouse_port, bool treat_local_as_remote)
+                 const String & username, const String & password, UInt16 clickhouse_port, bool treat_local_as_remote, bool secure)
 {
    UInt32 current_shard_num = 1;

@ -327,7 +328,7 @@ Cluster::Cluster(const Settings & settings, const std::vector<std::vector<String
    {
        Addresses current;
        for (auto & replica : shard)
-            current.emplace_back(replica, username, password, clickhouse_port);
+            current.emplace_back(replica, username, password, clickhouse_port, secure);

        addresses_with_failover.emplace_back(current);

--- a/dbms/src/Interpreters/Cluster.h
+++ b/dbms/src/Interpreters/Cluster.h
@ -24,7 +24,7 @@ public:
    /// This parameter is needed only to check that some address is local (points to ourself).
    Cluster(const Settings & settings, const std::vector<std::vector<String>> & names,
            const String & username, const String & password,
-            UInt16 clickhouse_port, bool treat_local_as_remote);
+            UInt16 clickhouse_port, bool treat_local_as_remote, bool secure = false);

    Cluster(const Cluster &) = delete;
    Cluster & operator=(const Cluster &) = delete;
@ -69,7 +69,7 @@ public:

        Address() = default;
        Address(const Poco::Util::AbstractConfiguration & config, const String & config_prefix);
-        Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port);
+        Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port, bool secure_ = false);

        /// Returns 'escaped_host_name:port'
        String toString() const;
--- a/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp
+++ b/dbms/src/Interpreters/DatabaseAndTableWithAlias.cpp
@ -13,7 +13,7 @@ namespace DB

 /// Checks that ast is ASTIdentifier and remove num_qualifiers_to_strip components from left.
 /// Example: 'database.table.name' -> (num_qualifiers_to_strip = 2) -> 'name'.
-void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
+void stripIdentifier(const DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
 {
    ASTIdentifier * identifier = typeid_cast<ASTIdentifier *>(ast.get());

@ -22,29 +22,15 @@ void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip)

    if (num_qualifiers_to_strip)
    {
-        size_t num_components = identifier->children.size();
-
-        /// plain column
-        if (num_components - num_qualifiers_to_strip == 1)
-        {
-            DB::String node_alias = identifier->tryGetAlias();
-            ast = identifier->children.back();
-            if (!node_alias.empty())
-                ast->setAlias(node_alias);
-        }
-        else
-            /// nested column
-        {
-            identifier->children.erase(identifier->children.begin(), identifier->children.begin() + num_qualifiers_to_strip);
+        identifier->name_parts.erase(identifier->name_parts.begin(), identifier->name_parts.begin() + num_qualifiers_to_strip);
        DB::String new_name;
-            for (const auto & child : identifier->children)
+        for (const auto & part : identifier->name_parts)
        {
            if (!new_name.empty())
                new_name += '.';
-                new_name += static_cast<const ASTIdentifier &>(*child.get()).name;
-            }
-            identifier->name = new_name;
+            new_name += part;
        }
+        identifier->name.swap(new_name);
    }
 }

@ -52,32 +38,16 @@ void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip)
 size_t getNumComponentsToStripInOrderToTranslateQualifiedName(const ASTIdentifier & identifier,
                                                              const DatabaseAndTableWithAlias & names)
 {
-    size_t num_qualifiers_to_strip = 0;
-
-    /// It is compound identifier
-    if (!identifier.children.empty())
-    {
-        size_t num_components = identifier.children.size();
-
    /// database.table.column
-        if (num_components >= 3
-            && !names.database.empty()
-            && *getIdentifierName(identifier.children[0]) == names.database
-            && *getIdentifierName(identifier.children[1]) == names.table)
-        {
-            num_qualifiers_to_strip = 2;
-        }
+    if (doesIdentifierBelongTo(identifier, names.database, names.table))
+        return 2;

-        /// table.column or alias.column. If num_components > 2, it is like table.nested.column.
-        if (num_components >= 2
-            && ((!names.table.empty() && *getIdentifierName(identifier.children[0]) == names.table)
-                || (!names.alias.empty() && *getIdentifierName(identifier.children[0]) == names.alias)))
-        {
-            num_qualifiers_to_strip = 1;
-        }
-    }
+    /// table.column or alias.column.
+    if (doesIdentifierBelongTo(identifier, names.table) ||
+        doesIdentifierBelongTo(identifier, names.alias))
+        return 1;

-    return num_qualifiers_to_strip;
+    return 0;
 }


@ -87,13 +57,13 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & ident
    table = identifier.name;
    alias = identifier.tryGetAlias();

-    if (!identifier.children.empty())
+    if (!identifier.name_parts.empty())
    {
-        if (identifier.children.size() != 2)
-            throw Exception("Logical error: number of components in table expression not equal to two", ErrorCodes::LOGICAL_ERROR);
+        if (identifier.name_parts.size() != 2)
+            throw Exception("Logical error: 2 components expected in table expression '" + identifier.name + "'", ErrorCodes::LOGICAL_ERROR);

-        getIdentifierName(identifier.children[0], database);
-        getIdentifierName(identifier.children[1], table);
+        database = identifier.name_parts[0];
+        table = identifier.name_parts[1];
    }
 }

@ -118,6 +88,22 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableExpression &
        throw Exception("Logical error: no known elements in ASTTableExpression", ErrorCodes::LOGICAL_ERROR);
 }

+/// Checks whether this name (as written in table.*, alias.* or database.table.*) refers to db_table.
+/// When table_may_be_an_alias is set, this->table is additionally compared with db_table's alias.
+bool DatabaseAndTableWithAlias::satisfies(const DatabaseAndTableWithAlias & db_table, bool table_may_be_an_alias)
+{
+    const bool same_database_and_table = database == db_table.database && table == db_table.table;
+
+    /// A database-qualified name can only match by full (database, table) equality.
+    if (!database.empty())
+        return same_database_and_table;
+
+    if (!db_table.table.empty() && table == db_table.table)
+        return true;
+
+    /// With no alias on the other side only the full comparison is left.
+    if (db_table.alias.empty())
+        return same_database_and_table;
+
+    const bool alias_matches = alias == db_table.alias;
+    const bool table_used_as_alias = table_may_be_an_alias && table == db_table.alias;
+    return alias_matches || table_used_as_alias;
+}
+
 String DatabaseAndTableWithAlias::getQualifiedNamePrefix() const
 {
    if (alias.empty() && table.empty())
@ -133,17 +119,7 @@ void DatabaseAndTableWithAlias::makeQualifiedName(const ASTPtr & ast) const
        String prefix = getQualifiedNamePrefix();
        identifier->name.insert(identifier->name.begin(), prefix.begin(), prefix.end());

-        Names qualifiers;
-        if (!alias.empty())
-            qualifiers.push_back(alias);
-        else
-        {
-            qualifiers.push_back(database);
-            qualifiers.push_back(table);
-        }
-
-        for (const auto & qualifier : qualifiers)
-            identifier->children.emplace_back(std::make_shared<ASTIdentifier>(qualifier));
+        addIdentifierQualifier(*identifier, database, table, alias);
    }
 }

@ -209,21 +185,13 @@ std::optional<DatabaseAndTableWithAlias> getDatabaseAndTable(const ASTSelectQuer
    return DatabaseAndTableWithAlias(database_and_table_name);
 }

-ASTPtr getTableFunctionOrSubquery(const ASTSelectQuery & select, size_t table_number)
+ASTPtr extractTableExpression(const ASTSelectQuery & select, size_t table_number)
 {
-    const ASTTableExpression * table_expression = getTableExpression(select, table_number);
-    if (table_expression)
+    if (const ASTTableExpression * table_expression = getTableExpression(select, table_number))
    {
-#if 1   /// TODO: It hides some logical error in InterpreterSelectQuery & distributed tables
        if (table_expression->database_and_table_name)
-        {
-            if (table_expression->database_and_table_name->children.empty())
            return table_expression->database_and_table_name;

-            if (table_expression->database_and_table_name->children.size() == 2)
-                return table_expression->database_and_table_name->children[1];
-        }
-#endif
        if (table_expression->table_function)
            return table_expression->table_function;

--- a/dbms/src/Interpreters/DatabaseAndTableWithAlias.h
+++ b/dbms/src/Interpreters/DatabaseAndTableWithAlias.h
@ -2,8 +2,9 @@

 #include <memory>
 #include <optional>
+
 #include <Core/Types.h>
-#include <Parsers/ASTSelectQuery.h>
+

 namespace DB
 {
@ -33,9 +34,12 @@ struct DatabaseAndTableWithAlias

    /// If ast is ASTIdentifier, prepend getQualifiedNamePrefix() to it's name.
    void makeQualifiedName(const ASTPtr & ast) const;
+
+    /// Check if it satisfies another db_table name. @note operation is not symmetric.
+    bool satisfies(const DatabaseAndTableWithAlias & table, bool table_may_be_an_alias);
 };

-void stripIdentifier(DB::ASTPtr & ast, size_t num_qualifiers_to_strip);
+void stripIdentifier(const DB::ASTPtr & ast, size_t num_qualifiers_to_strip);

 size_t getNumComponentsToStripInOrderToTranslateQualifiedName(const ASTIdentifier & identifier,
                                                              const DatabaseAndTableWithAlias & names);
@ -44,6 +48,6 @@ std::vector<DatabaseAndTableWithAlias> getDatabaseAndTables(const ASTSelectQuery
 std::optional<DatabaseAndTableWithAlias> getDatabaseAndTable(const ASTSelectQuery & select, size_t table_number);

 std::vector<const ASTTableExpression *> getSelectTablesExpression(const ASTSelectQuery & select_query);
-ASTPtr getTableFunctionOrSubquery(const ASTSelectQuery & select, size_t table_number);
+ASTPtr extractTableExpression(const ASTSelectQuery & select, size_t table_number);

 }
--- a/dbms/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp
+++ b/dbms/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp
@ -1,6 +1,7 @@
 #include <Parsers/ASTFunction.h>
 #include <Parsers/ASTLiteral.h>
 #include <Parsers/ASTSubquery.h>
+#include <Parsers/ASTSelectQuery.h>
 #include <Parsers/ASTTablesInSelectQuery.h>
 #include <Parsers/ASTExpressionList.h>

--- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp
+++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp
@ -310,7 +310,7 @@ void ExpressionAnalyzer::makeSetsForIndexImpl(const ASTPtr & node, const Block &

            if (!prepared_sets.count(arg->range)) /// Not already prepared.
            {
-                if (typeid_cast<ASTSubquery *>(arg.get()) || typeid_cast<ASTIdentifier *>(arg.get()))
+                if (typeid_cast<ASTSubquery *>(arg.get()) || isIdentifier(arg))
                {
                    if (settings.use_index_for_in_with_subqueries)
                        tryMakeSetForIndexFromSubquery(arg);
--- a/dbms/src/Interpreters/GlobalSubqueriesVisitor.h
+++ b/dbms/src/Interpreters/GlobalSubqueriesVisitor.h
@ -55,7 +55,7 @@ public:
            ASTPtr table_name;
            ASTPtr subquery_or_table_name;

-            if (typeid_cast<const ASTIdentifier *>(subquery_or_table_name_or_table_expression.get()))
+            if (isIdentifier(subquery_or_table_name_or_table_expression))
            {
                table_name = subquery_or_table_name_or_table_expression;
                subquery_or_table_name = table_name;
@ -86,7 +86,7 @@ public:
            if (table_name)
            {
                /// If this is already an external table, you do not need to add anything. Just remember its presence.
-                if (external_tables.end() != external_tables.find(static_cast<const ASTIdentifier &>(*table_name).name))
+                if (external_tables.end() != external_tables.find(*getIdentifierName(table_name)))
                    return;
            }

--- a/dbms/src/Interpreters/InterpreterDescribeQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterDescribeQuery.cpp
@ -86,20 +86,17 @@ BlockInputStreamPtr InterpreterDescribeQuery::executeImpl()
            String database_name;
            String table_name;

-            auto identifier = table_expression->database_and_table_name;
-            if (identifier->children.size() > 2)
+            auto identifier = typeid_cast<const ASTIdentifier *>(table_expression->database_and_table_name.get());
+            if (identifier->name_parts.size() > 2)
                throw Exception("Logical error: more than two components in table expression", ErrorCodes::LOGICAL_ERROR);

-            if (identifier->children.size() > 1)
+            if (identifier->name_parts.size() > 1)
            {
-                auto database_ptr = identifier->children[0];
-                auto table_ptr = identifier->children[1];
-
-                getIdentifierName(database_ptr, database_name);
-                getIdentifierName(table_ptr, table_name);
+                database_name = identifier->name_parts[0];
+                table_name = identifier->name_parts[1];
            }
            else
-                getIdentifierName(identifier, table_name);
+                table_name = identifier->name;

            table = context.getTable(database_name, table_name);
        }
--- a/dbms/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/dbms/src/Interpreters/InterpreterSelectQuery.cpp
@ -147,14 +147,22 @@ InterpreterSelectQuery::InterpreterSelectQuery(

    max_streams = settings.max_threads;

-    ASTPtr table_expression = getTableFunctionOrSubquery(query, 0);
+    ASTPtr table_expression = extractTableExpression(query, 0);
+
+    bool is_table_func = false;
+    bool is_subquery = false;
+    if (table_expression)
+    {
+        is_table_func = typeid_cast<const ASTFunction *>(table_expression.get());
+        is_subquery = typeid_cast<const ASTSelectWithUnionQuery *>(table_expression.get());
+    }

    if (input)
    {
        /// Read from prepared input.
        source_header = input->getHeader();
    }
-    else if (table_expression && typeid_cast<const ASTSelectWithUnionQuery *>(table_expression.get()))
+    else if (is_subquery)
    {
        /// Read from subquery.
        interpreter_subquery = std::make_unique<InterpreterSelectWithUnionQuery>(
@ -164,7 +172,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
    }
    else if (!storage)
    {
-        if (table_expression && typeid_cast<const ASTFunction *>(table_expression.get()))
+        if (is_table_func)
        {
            /// Read from table function.
            storage = context.getQueryContext().executeTableFunction(table_expression);
@ -208,7 +216,7 @@ InterpreterSelectQuery::InterpreterSelectQuery(
        if (query_analyzer->isRewriteSubqueriesPredicate())
        {
            /// remake interpreter_subquery when PredicateOptimizer is rewrite subqueries and main table is subquery
-            if (table_expression && typeid_cast<ASTSelectWithUnionQuery *>(table_expression.get()))
+            if (is_subquery)
                interpreter_subquery = std::make_unique<InterpreterSelectWithUnionQuery>(
                    table_expression, getSubqueryContext(context), required_columns, QueryProcessingStage::Complete, subquery_depth + 1,
                    only_analyze);
@ -921,7 +929,7 @@ void InterpreterSelectQuery::executeFetchColumns(
        /// If we need less number of columns that subquery have - update the interpreter.
        if (required_columns.size() < source_header.columns())
        {
-            ASTPtr subquery = getTableFunctionOrSubquery(query, 0);
+            ASTPtr subquery = extractTableExpression(query, 0);
            if (!subquery)
                throw Exception("Subquery expected", ErrorCodes::LOGICAL_ERROR);

@ -1396,7 +1404,7 @@ bool hasWithTotalsInAnySubqueryInFromClause(const ASTSelectQuery & query)
      * In other cases, totals will be computed on the initiating server of the query, and it is not necessary to read the data to the end.
      */

-    if (auto query_table = getTableFunctionOrSubquery(query, 0))
+    if (auto query_table = extractTableExpression(query, 0))
    {
        if (auto ast_union = typeid_cast<const ASTSelectWithUnionQuery *>(query_table.get()))
        {
--- a/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
+++ b/dbms/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
@ -1,6 +1,5 @@
 #include <Common/typeid_cast.h>
 #include <Interpreters/JoinToSubqueryTransformVisitor.h>
-#include <Interpreters/SemanticSelectQuery.h>
 #include <Parsers/ASTSelectQuery.h>
 #include <Parsers/ASTTablesInSelectQuery.h>
 #include <Parsers/ASTIdentifier.h>
@ -19,6 +18,7 @@ namespace ErrorCodes
    extern const int TOO_DEEP_AST;
 }

+#if 0
 /// Attach additional semantic info to generated select.
 struct AppendSemanticVisitorData
 {
@ -35,6 +35,7 @@ struct AppendSemanticVisitorData
        done = true;
    }
 };
+#endif

 /// Replaces one table element with pair.
 struct RewriteTablesVisitorData
@ -124,7 +125,7 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast
        if (!left)
            return;

-        SemanticSelectQuery::hideNames(select, hidden_names, subquery_name);
+        //SemanticSelectQuery::hideNames(select, hidden_names, subquery_name);
    }

    select.tables = std::make_shared<ASTTablesInSelectQuery>();
@ -135,11 +136,15 @@ void JoinToSubqueryTransformMatcher::visit(ASTSelectQuery & select, ASTPtr & ast
    data.done = true;
 }

-ASTPtr JoinToSubqueryTransformMatcher::replaceJoin(ASTSelectQuery & select, ASTPtr ast_left, ASTPtr ast_right, const String & subquery_alias)
+ASTPtr JoinToSubqueryTransformMatcher::replaceJoin(ASTSelectQuery &, ASTPtr ast_left, ASTPtr ast_right, const String & subquery_alias)
 {
+#if 0
    using RewriteMatcher = LinkedMatcher<
        OneTypeMatcher<RewriteTablesVisitorData>,
        OneTypeMatcher<AppendSemanticVisitorData>>;
+#else
+    using RewriteMatcher = OneTypeMatcher<RewriteTablesVisitorData>;
+#endif
    using RewriteVisitor = InDepthNodeVisitor<RewriteMatcher, true>;

    auto left = typeid_cast<const ASTTablesInSelectQueryElement *>(ast_left.get());
@ -160,8 +165,12 @@ ASTPtr JoinToSubqueryTransformMatcher::replaceJoin(ASTSelectQuery & select, ASTP
    if (!res)
        throw Exception("Cannot parse rewrite query", ErrorCodes::LOGICAL_ERROR);

+#if 0
    RewriteVisitor::Data visitor_data =
        std::make_pair<RewriteTablesVisitorData, AppendSemanticVisitorData>({ast_left, ast_right}, {select.semantic});
+#else
+    RewriteVisitor::Data visitor_data{ast_left, ast_right};
+#endif
    RewriteVisitor(visitor_data).visit(res);
    return res;
 }
--- a/dbms/src/Interpreters/PredicateExpressionsOptimizer.cpp
+++ b/dbms/src/Interpreters/PredicateExpressionsOptimizer.cpp
@ -133,8 +133,12 @@ void PredicateExpressionsOptimizer::getDependenciesAndQualifiedOfExpression(cons
 {
    if (const auto identifier = typeid_cast<ASTIdentifier *>(expression.get()))
    {
-        if (!identifier->children.empty())
-            dependencies_and_qualified.emplace_back(std::pair(identifier, expression->getAliasOrColumnName()));
+        String table_alias;
+        if (!identifier->name_parts.empty())
+        {
+            if (!tables_with_aliases.empty())
+                table_alias = tables_with_aliases[0].getQualifiedNamePrefix();
+        }
        else
        {
            size_t best_table_pos = 0;
@ -153,9 +157,11 @@ void PredicateExpressionsOptimizer::getDependenciesAndQualifiedOfExpression(cons
                }
            }

-            String qualified_name = tables_with_aliases[best_table_pos].getQualifiedNamePrefix() + expression->getAliasOrColumnName();
-            dependencies_and_qualified.emplace_back(std::pair(identifier, qualified_name));
+            table_alias = tables_with_aliases[best_table_pos].getQualifiedNamePrefix();
        }
+
+        String qualified_name = table_alias + expression->getAliasOrColumnName();
+        dependencies_and_qualified.emplace_back(std::pair(identifier, qualified_name));
    }
    else
    {
@ -356,31 +362,17 @@ ASTs PredicateExpressionsOptimizer::evaluateAsterisk(ASTSelectQuery * select_que
        if (qualified_asterisk->children.size() != 1)
            throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);

-        ASTIdentifier * ident = typeid_cast<ASTIdentifier *>(qualified_asterisk->children[0].get());
-        if (!ident)
-            throw Exception("Logical error: qualified asterisk must have identifier as its child", ErrorCodes::LOGICAL_ERROR);
+        DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);

-        size_t num_components = ident->children.size();
-        if (num_components > 2)
-            throw Exception("Qualified asterisk cannot have more than two qualifiers", ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
-
-        for (auto it = tables_expression.begin(); it != tables_expression.end(); ++it)
+        for (auto it = tables_expression.begin(); it != tables_expression.end();)
        {
            const ASTTableExpression * table_expression = *it;
            DatabaseAndTableWithAlias database_and_table_with_alias(*table_expression, context.getCurrentDatabase());
-            /// database.table.*
-            if (num_components == 2 && !database_and_table_with_alias.database.empty()
-                && static_cast<const ASTIdentifier &>(*ident->children[0]).name == database_and_table_with_alias.database
-                && static_cast<const ASTIdentifier &>(*ident->children[1]).name == database_and_table_with_alias.table)
-                continue;
-            /// table.* or alias.*
-            else if (num_components == 0
-                     && ((!database_and_table_with_alias.table.empty() && ident->name == database_and_table_with_alias.table)
-                         || (!database_and_table_with_alias.alias.empty() && ident->name == database_and_table_with_alias.alias)))
-                continue;
+
+            if (ident_db_and_name.satisfies(database_and_table_with_alias, true))
+                ++it;
            else
-                /// It's not a required table
-                tables_expression.erase(it);
+                it = tables_expression.erase(it); /// It's not a required table
        }
    }

--- a/dbms/src/Interpreters/QueryNormalizer.cpp
+++ b/dbms/src/Interpreters/QueryNormalizer.cpp
@ -168,18 +168,11 @@ void QueryNormalizer::visit(ASTExpressionList & node, const ASTPtr &, Data & dat
        }
        else if (const auto * qualified_asterisk = typeid_cast<const ASTQualifiedAsterisk *>(child.get()))
        {
-            const ASTIdentifier * identifier = typeid_cast<const ASTIdentifier *>(qualified_asterisk->children[0].get());
-            size_t num_components = identifier->children.size();
+            DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);

            for (const auto & [table_name, table_columns] : tables_with_columns)
            {
-                if ((num_components == 2                    /// database.table.*
-                        && !table_name.database.empty()     /// This is normal (not a temporary) table.
-                        && static_cast<const ASTIdentifier &>(*identifier->children[0]).name == table_name.database
-                        && static_cast<const ASTIdentifier &>(*identifier->children[1]).name == table_name.table)
-                    || (num_components == 0                                                         /// t.*
-                        && ((!table_name.table.empty() && identifier->name == table_name.table)         /// table.*
-                            || (!table_name.alias.empty() && identifier->name == table_name.alias))))   /// alias.*
+                if (ident_db_and_name.satisfies(table_name, true))
                {
                    for (const auto & column_name : table_columns)
                        node.children.emplace_back(std::make_shared<ASTIdentifier>(column_name));
--- a/dbms/src/Interpreters/SemanticSelectQuery.h
+++ b/dbms/src/Interpreters/SemanticSelectQuery.h
@ -1,43 +0,0 @@
-#pragma once
-#include <Parsers/IAST.h>
-#include <Parsers/ASTSelectQuery.h>
-
-namespace DB
-{
-
-/// Additional information for ASTSelectQuery
-class SemanticSelectQuery : public ISemantic
-{
-public:
-    SemanticPtr clone() const override { return std::make_shared<SemanticSelectQuery>(*this); }
-
-    std::vector<String> getPossibleNames(const String & name) const
-    {
-        std::vector<String> res;
-        res.push_back(name);
-
-        for (auto it = hidings.find(name); it != hidings.end(); it = hidings.find(it->second))
-            res.push_back(it->second);
-        return res;
-    }
-
-    static void hideNames(ASTSelectQuery & select, const std::vector<String> & hidden, const String & new_name)
-    {
-        if (!select.semantic)
-            select.semantic = std::make_shared<SemanticSelectQuery>();
-
-        auto & sema = static_cast<SemanticSelectQuery &>(*select.semantic);
-        sema.hideNames(hidden, new_name);
-    }
-
-private:
-    std::unordered_map<String, String> hidings;
-
-    void hideNames(const std::vector<String> & hidden, const String & new_name)
-    {
-        for (auto & name : hidden)
-            hidings.emplace(name, new_name);
-    }
-};
-
-}
--- a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
+++ b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
@ -89,38 +89,17 @@ std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(const ASTIdentifier

 std::vector<ASTPtr *> TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & , const ASTPtr & ast, Data & data)
 {
-    const std::vector<DatabaseAndTableWithAlias> & tables = data.tables;
-
    if (ast->children.size() != 1)
        throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);

-    ASTIdentifier * ident = typeid_cast<ASTIdentifier *>(ast->children[0].get());
-    if (!ident)
-        throw Exception("Logical error: qualified asterisk must have identifier as its child", ErrorCodes::LOGICAL_ERROR);
+    auto & ident = ast->children[0];

-    size_t num_components = ident->children.size();
-    if (num_components > 2)
-        throw Exception("Qualified asterisk cannot have more than two qualifiers", ErrorCodes::UNKNOWN_ELEMENT_IN_AST);
+    /// @note it could contain table alias as table name.
+    DatabaseAndTableWithAlias db_and_table(ident);

-    DatabaseAndTableWithAlias db_and_table(*ident);
-
-    for (const auto & table_names : tables)
-    {
-        /// database.table.*, table.* or alias.*
-        if (num_components == 2)
-        {
-            if (!table_names.database.empty() &&
-                db_and_table.database == table_names.database &&
-                db_and_table.table == table_names.table)
+    for (const auto & known_table : data.tables)
+        if (db_and_table.satisfies(known_table, true))
            return {};
-        }
-        else if (num_components == 0)
-        {
-            if ((!table_names.table.empty() && db_and_table.table == table_names.table) ||
-                (!table_names.alias.empty() && db_and_table.table == table_names.alias))
-                return {};
-        }
-    }

    throw Exception("Unknown qualified identifier: " + ident->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
 }
--- a/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.h
+++ b/dbms/src/Interpreters/TranslateQualifiedNamesVisitor.h
@ -2,6 +2,7 @@

 #include <vector>

+#include <Core/Names.h>
 #include <Interpreters/DatabaseAndTableWithAlias.h>
 #include <Interpreters/InDepthNodeVisitor.h>

--- a/dbms/src/Parsers/ASTIdentifier.cpp
+++ b/dbms/src/Parsers/ASTIdentifier.cpp
@ -18,14 +18,14 @@ void ASTIdentifier::formatImplWithoutAlias(const FormatSettings & settings, Form

    /// A simple or compound identifier?

-    if (children.size() > 1)
+    if (name_parts.size() > 1)
    {
-        for (size_t i = 0, size = children.size(); i < size; ++i)
+        for (size_t i = 0, size = name_parts.size(); i < size; ++i)
        {
            if (i != 0)
                settings.ostr << '.';

-            format_element(static_cast<const ASTIdentifier &>(*children[i].get()).name);
+            format_element(name_parts[i]);
        }
    }
    else
@ -44,11 +44,7 @@ ASTPtr createTableIdentifier(const String & database_name, const String & table_
    if (database_name.empty())
        return ASTIdentifier::createSpecial(table_name);

-    ASTPtr database = ASTIdentifier::createSpecial(database_name);
-    ASTPtr table = ASTIdentifier::createSpecial(table_name);
-
-    ASTPtr database_and_table = ASTIdentifier::createSpecial(database_name + "." + table_name);
-    database_and_table->children = {database, table};
+    ASTPtr database_and_table = ASTIdentifier::createSpecial(database_name + "." + table_name, {database_name, table_name});
    return database_and_table;
 }

@ -117,4 +113,35 @@ void setIdentifierSpecial(ASTPtr & ast)
            id->setSpecial();
 }

+/// Appends to identifier.name_parts either the alias alone (if non-empty) or the database (if non-empty) and then the table.
+void addIdentifierQualifier(ASTIdentifier & identifier, const String & database, const String & table, const String & alias)
+{
+    auto & parts = identifier.name_parts;
+    if (!alias.empty())
+    {
+        parts.emplace_back(alias);
+        return;
+    }
+    if (!database.empty())
+        parts.emplace_back(database);
+    parts.emplace_back(table);
+}
+
+/// Returns true if `identifier` has at least three name parts and the first two are equal to `database` and `table`.
+bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & database, const String & table)
+{
+    const auto & parts = identifier.name_parts;
+    if (parts.size() < 3)
+        return false;
+    return parts[0] == database && parts[1] == table;
+}
+
+/// Returns true if `identifier` has at least two name parts and the first one is equal to `table`.
+bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & table)
+{
+    const auto & parts = identifier.name_parts;
+    const bool has_qualifier = parts.size() >= 2;
+    return has_qualifier && parts[0] == table;
+}
+
 }
--- a/dbms/src/Parsers/ASTIdentifier.h
+++ b/dbms/src/Parsers/ASTIdentifier.h
@ -12,11 +12,14 @@ namespace DB
 class ASTIdentifier : public ASTWithAlias
 {
 public:
-    /// name. The composite identifier here will have a concatenated name (of the form a.b.c), and individual components will be available inside the children.
+    /// The composite identifier will have a concatenated name (of the form a.b.c),
+    /// and individual components will be available inside the name_parts.
    String name;
+    std::vector<String> name_parts;

-    ASTIdentifier(const String & name_)
+    ASTIdentifier(const String & name_, std::vector<String> && name_parts_ = {})
        : name(name_)
+        , name_parts(std::move(name_parts_)) /// name_parts_ is a named rvalue reference: move it, otherwise the vector is copied
        , special(false)
    {
        range = StringRange(name.data(), name.data() + name.size());
@ -37,11 +40,13 @@ protected:
    void appendColumnNameImpl(WriteBuffer & ostr) const override;

 private:
+    using ASTWithAlias::children; /// ASTIdentifier is child free
+
    bool special; /// TODO: it would be ptr to semantic here

-    static std::shared_ptr<ASTIdentifier> createSpecial(const String & name_)
+    static std::shared_ptr<ASTIdentifier> createSpecial(const String & name, std::vector<String> && name_parts = {})
    {
-        auto ret = std::make_shared<ASTIdentifier>(name_);
+        auto ret = std::make_shared<ASTIdentifier>(name, std::move(name_parts));
        ret->special = true;
        return ret;
    }
@ -77,5 +82,8 @@ std::optional<String> getTableIdentifierName(const ASTIdentifier & node);
 std::optional<String> getTableIdentifierName(const ASTPtr & ast);

 void setIdentifierSpecial(ASTPtr & ast);
+void addIdentifierQualifier(ASTIdentifier & identifier, const String & database, const String & table, const String & alias);
+bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & table_or_alias);
+bool doesIdentifierBelongTo(const ASTIdentifier & identifier, const String & database, const String & table);

 }
--- a/dbms/src/Parsers/ASTSelectQuery.cpp
+++ b/dbms/src/Parsers/ASTSelectQuery.cpp
@ -51,8 +51,6 @@ ASTPtr ASTSelectQuery::clone() const

 #undef CLONE

-    if (semantic)
-        res->semantic = semantic->clone();
    return res;
 }

--- a/dbms/src/Parsers/ExpressionElementParsers.cpp
+++ b/dbms/src/Parsers/ExpressionElementParsers.cpp
@ -169,19 +169,19 @@ bool ParserCompoundIdentifier::parseImpl(Pos & pos, ASTPtr & node, Expected & ex
        return false;

    String name;
+    std::vector<String> parts;
    const ASTExpressionList & list = static_cast<const ASTExpressionList &>(*id_list.get());
    for (const auto & child : list.children)
    {
        if (!name.empty())
            name += '.';
-        name += *getIdentifierName(child);
+        parts.emplace_back(*getIdentifierName(child));
+        name += parts.back();
    }

-    node = std::make_shared<ASTIdentifier>(name);
-
-    /// In `children`, remember the identifiers-components, if there are more than one.
-    if (list.children.size() > 1)
-        node->children.insert(node->children.end(), list.children.begin(), list.children.end());
+    if (parts.size() == 1)
+        parts.clear();
+    node = std::make_shared<ASTIdentifier>(name, std::move(parts));

    return true;
 }
--- a/dbms/src/Parsers/IAST.h
+++ b/dbms/src/Parsers/IAST.h
@ -31,20 +31,6 @@ class IAST;
 using ASTPtr = std::shared_ptr<IAST>;
 using ASTs = std::vector<ASTPtr>;

-class ISemantic;
-using SemanticPtr = std::shared_ptr<ISemantic>;
-
-/// Interfase to set additional information to IAST. Derived classes should be named according to their AST nodes' types:
-/// ASTIdentifier => SemanticIdentifer, ASTSome => SemanticSome, ...
-class ISemantic
-{
-public:
-    virtual ~ISemantic() = default;
-    ISemantic() = default;
-    ISemantic(const ISemantic &) = default;
-    virtual SemanticPtr clone() const = 0;
-};
-
 class WriteBuffer;


@ -58,7 +44,6 @@ public:

    /// This pointer does not allow it to be deleted while the range refers to it.
    StringPtr owned_string;
-    SemanticPtr semantic;

    virtual ~IAST() = default;
    IAST() = default;
--- a/dbms/src/Storages/MergeTree/MergeTreeData.h
+++ b/dbms/src/Storages/MergeTree/MergeTreeData.h
@ -219,6 +219,8 @@ public:
        /// If commit() was not called, deletes temporary files, canceling the ALTER.
        ~AlterDataPartTransaction();

+        const String & getPartName() const { return data_part->name; }
+
        /// Review the changes before the commit.
        const NamesAndTypesList & getNewColumns() const { return new_columns; }
        const DataPart::Checksums & getNewChecksums() const { return new_checksums; }
--- a/dbms/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeDataPartChecksum.cpp
@ -312,7 +312,11 @@ String MergeTreeDataPartChecksums::getTotalChecksumHex() const
 void MinimalisticDataPartChecksums::serialize(WriteBuffer & to) const
 {
    writeString("checksums format version: 5\n", to);
+    serializeWithoutHeader(to);
+}

+void MinimalisticDataPartChecksums::serializeWithoutHeader(WriteBuffer & to) const
+{
    writeVarUInt(num_compressed_files, to);
    writeVarUInt(num_uncompressed_files, to);

@ -337,26 +341,31 @@ bool MinimalisticDataPartChecksums::deserialize(ReadBuffer & in)

    if (format_version < MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS)
    {
-        auto full_checksums_ptr = std::make_unique<MergeTreeDataPartChecksums>();
-        if (!full_checksums_ptr->read(in, format_version))
+        MergeTreeDataPartChecksums new_full_checksums;
+        if (!new_full_checksums.read(in, format_version))
            return false;

-        computeTotalChecksums(*full_checksums_ptr);
-        full_checksums = std::move(full_checksums_ptr);
+        computeTotalChecksums(new_full_checksums);
+        full_checksums = std::move(new_full_checksums);
        return true;
    }

    if (format_version > MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS)
        throw Exception("Unknown checksums format version: " + DB::toString(format_version), ErrorCodes::UNKNOWN_FORMAT);

+    deserializeWithoutHeader(in);
+
+    return true;
+}
+
+void MinimalisticDataPartChecksums::deserializeWithoutHeader(ReadBuffer & in)
+{
    readVarUInt(num_compressed_files, in);
    readVarUInt(num_uncompressed_files, in);

    readPODBinary(hash_of_all_files, in);
    readPODBinary(hash_of_uncompressed_files, in);
    readPODBinary(uncompressed_hash_of_compressed_files, in);
-
-    return true;
 }

 void MinimalisticDataPartChecksums::computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums_)
@ -410,7 +419,7 @@ String MinimalisticDataPartChecksums::getSerializedString(const MergeTreeDataPar
    return checksums.getSerializedString();
 }

-void MinimalisticDataPartChecksums::checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files)
+void MinimalisticDataPartChecksums::checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const
 {
    if (full_checksums && rhs.full_checksums)
        full_checksums->checkEqual(*rhs.full_checksums, check_uncompressed_hash_in_compressed_files);
@ -419,7 +428,7 @@ void MinimalisticDataPartChecksums::checkEqual(const MinimalisticDataPartChecksu
    checkEqualImpl(rhs, check_uncompressed_hash_in_compressed_files);
 }

-void MinimalisticDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files)
+void MinimalisticDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const
 {
    if (full_checksums)
        full_checksums->checkEqual(rhs, check_uncompressed_hash_in_compressed_files);
@ -430,7 +439,7 @@ void MinimalisticDataPartChecksums::checkEqual(const MergeTreeDataPartChecksums
    checkEqualImpl(rhs_minimalistic, check_uncompressed_hash_in_compressed_files);
 }

-void MinimalisticDataPartChecksums::checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files)
+void MinimalisticDataPartChecksums::checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const
 {
    if (num_compressed_files != rhs.num_compressed_files || num_uncompressed_files != rhs.num_uncompressed_files)
    {
--- a/dbms/src/Storages/MergeTree/MergeTreeDataPartChecksum.h
+++ b/dbms/src/Storages/MergeTree/MergeTreeDataPartChecksum.h
@ -4,7 +4,7 @@
 #include <IO/WriteBuffer.h>
 #include <city.h>
 #include <map>
-#include <memory>
+#include <optional>


 class SipHash;
@ -112,7 +112,7 @@ struct MinimalisticDataPartChecksums
    }

    /// Is set only for old formats
-    std::unique_ptr<MergeTreeDataPartChecksums> full_checksums;
+    std::optional<MergeTreeDataPartChecksums> full_checksums;

    static constexpr size_t MINIMAL_VERSION_WITH_MINIMALISTIC_CHECKSUMS = 5;

@ -120,15 +120,17 @@ struct MinimalisticDataPartChecksums
    void computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums);

    bool deserialize(ReadBuffer & in);
+    void deserializeWithoutHeader(ReadBuffer & in);
    static MinimalisticDataPartChecksums deserializeFrom(const String & s);

    void serialize(WriteBuffer & to) const;
+    void serializeWithoutHeader(WriteBuffer & to) const;
    String getSerializedString();
    static String getSerializedString(const MergeTreeDataPartChecksums & full_checksums, bool minimalistic);

-    void checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files);
-    void checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files);
-    void checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files);
+    void checkEqual(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
+    void checkEqual(const MergeTreeDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
+    void checkEqualImpl(const MinimalisticDataPartChecksums & rhs, bool check_uncompressed_hash_in_compressed_files) const;
 };


--- a/dbms/src/Storages/MergeTree/MergeTreeSettings.h
+++ b/dbms/src/Storages/MergeTree/MergeTreeSettings.h
@ -148,6 +148,13 @@ struct MergeTreeSettings
      */                                                                                                      \
    M(SettingBool, use_minimalistic_checksums_in_zookeeper, true)                                             \
                                                                                                              \
+    /** Store part header (checksums and columns) in a compact format and a single part znode                 \
+      *  instead of separate znodes (<part>/columns and <part>/checksums).                                    \
+      * This can dramatically reduce snapshot size in ZooKeeper.                                              \
+      * Before enabling, check that all replicas support the new format.                                      \
+      */                                                                                                      \
+    M(SettingBool, use_minimalistic_part_header_in_zookeeper, false)                                          \
+                                                                                                              \
    /** How many records about mutations that are done to keep.                                               \
     *  If zero, then keep all of them */                                                                     \
    M(SettingUInt64, finished_mutations_to_keep, 100)                                                         \
--- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreeAlterThread.cpp
+++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreeAlterThread.cpp
@ -1,5 +1,6 @@
 #include <Storages/MergeTree/ReplicatedMergeTreeAlterThread.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
+#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
 #include <Storages/ColumnsDescription.h>
 #include <Storages/StorageReplicatedMergeTree.h>
 #include <Common/setThreadName.h>
@ -155,32 +156,9 @@ void ReplicatedMergeTreeAlterThread::run()
                if (!transaction)
                    continue;

+                storage.updatePartHeaderInZooKeeperAndCommit(zookeeper, *transaction);
+
                ++changed_parts;
-
-                /// Update part metadata in ZooKeeper.
-                Coordination::Requests ops;
-                ops.emplace_back(zkutil::makeSetRequest(
-                    storage.replica_path + "/parts/" + part->name + "/columns", transaction->getNewColumns().toString(), -1));
-                ops.emplace_back(zkutil::makeSetRequest(
-                    storage.replica_path + "/parts/" + part->name + "/checksums",
-                    storage.getChecksumsForZooKeeper(transaction->getNewChecksums()),
-                    -1));
-
-                try
-                {
-                    zookeeper->multi(ops);
-                }
-                catch (const Coordination::Exception & e)
-                {
-                    /// The part does not exist in ZK. We will add to queue for verification - maybe the part is superfluous, and it must be removed locally.
-                    if (e.code == Coordination::ZNONODE)
-                        storage.enqueuePartForCheck(part->name);
-
-                    throw;
-                }
-
-                /// Apply file changes.
-                transaction->commit();
            }

            /// Columns sizes could be quietly changed in case of MODIFY/ADD COLUMN
--- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
+++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
@ -1,5 +1,6 @@
 #include <Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h>
 #include <Storages/MergeTree/checkDataPart.h>
+#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
 #include <Storages/StorageReplicatedMergeTree.h>
 #include <Common/setThreadName.h>

@ -204,22 +205,34 @@ void ReplicatedMergeTreePartCheckThread::checkPart(const String & part_name)
        auto zookeeper = storage.getZooKeeper();
        auto table_lock = storage.lockStructure(false);

+        auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
+            part->columns, part->checksums);
+
+        String part_path = storage.replica_path + "/parts/" + part_name;
+        String part_znode;
        /// If the part is in ZooKeeper, check its data with its checksums, and them with ZooKeeper.
-        if (zookeeper->exists(storage.replica_path + "/parts/" + part_name))
+        if (zookeeper->tryGet(part_path, part_znode))
        {
            LOG_WARNING(log, "Checking data of part " << part_name << ".");

            try
            {
-                auto zk_checksums = MinimalisticDataPartChecksums::deserializeFrom(
-                    zookeeper->get(storage.replica_path + "/parts/" + part_name + "/checksums"));
-                zk_checksums.checkEqual(part->checksums, true);
+                ReplicatedMergeTreePartHeader zk_part_header;
+                if (!part_znode.empty())
+                    zk_part_header = ReplicatedMergeTreePartHeader::fromString(part_znode);
+                else
+                {
+                    String columns_znode = zookeeper->get(part_path + "/columns");
+                    String checksums_znode = zookeeper->get(part_path + "/checksums");
+                    zk_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
+                        columns_znode, checksums_znode);
+                }

-                auto zk_columns = NamesAndTypesList::parse(
-                    zookeeper->get(storage.replica_path + "/parts/" + part_name + "/columns"));
-                if (part->columns != zk_columns)
+                if (local_part_header.getColumnsHash() != zk_part_header.getColumnsHash())
                    throw Exception("Columns of local part " + part_name + " are different from ZooKeeper", ErrorCodes::TABLE_DIFFERS_TOO_MUCH);

+                zk_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true);
+
                checkDataPart(
                    storage.data.getFullPath() + part_name,
                    storage.data.index_granularity,
--- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.cpp
+++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.cpp
@ -0,0 +1,66 @@
+#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
+#include <Core/NamesAndTypes.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadBufferFromString.h>
+#include <Common/SipHash.h>
+#include <Common/StringUtils/StringUtils.h>
+
+namespace DB
+{
+
+static std::array<char, 16> getSipHash(const String & str)
+{
+    SipHash hash;
+    hash.update(str.data(), str.size());
+    std::array<char, 16> result;
+    hash.get128(result.data());
+    return result;
+}
+
+ReplicatedMergeTreePartHeader ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
+    const String & columns_znode, const String & checksums_znode)
+{
+    auto columns_hash = getSipHash(columns_znode);
+    auto checksums = MinimalisticDataPartChecksums::deserializeFrom(checksums_znode);
+    return ReplicatedMergeTreePartHeader(std::move(columns_hash), std::move(checksums));
+}
+
+ReplicatedMergeTreePartHeader ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
+    const NamesAndTypesList & columns,
+    const MergeTreeDataPartChecksums & full_checksums)
+{
+    MinimalisticDataPartChecksums checksums;
+    checksums.computeTotalChecksums(full_checksums);
+    return ReplicatedMergeTreePartHeader(getSipHash(columns.toString()), std::move(checksums));
+}
+
+void ReplicatedMergeTreePartHeader::read(ReadBuffer & in)
+{
+    in >> "part header format version: 1\n";
+    in.readStrict(columns_hash.data(), columns_hash.size());
+    checksums.deserializeWithoutHeader(in);
+}
+
+ReplicatedMergeTreePartHeader ReplicatedMergeTreePartHeader::fromString(const String & str)
+{
+    ReadBufferFromString in(str);
+    ReplicatedMergeTreePartHeader result;
+    result.read(in);
+    return result;
+}
+
+void ReplicatedMergeTreePartHeader::write(WriteBuffer & out) const
+{
+    writeString("part header format version: 1\n", out);
+    out.write(columns_hash.data(), columns_hash.size());
+    checksums.serializeWithoutHeader(out);
+}
+
+String ReplicatedMergeTreePartHeader::toString() const
+{
+    WriteBufferFromOwnString out;
+    write(out);
+    return out.str();
+}
+
+}
--- a/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.h
+++ b/dbms/src/Storages/MergeTree/ReplicatedMergeTreePartHeader.h
@ -0,0 +1,50 @@
+#pragma once
+
+#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
+#include <Core/Types.h>
+#include <IO/WriteBuffer.h>
+#include <IO/ReadBuffer.h>
+#include <IO/Operators.h>
+#include <array>
+
+
+namespace DB
+{
+
+class NamesAndTypesList;
+
+/// This class provides a compact representation of part metadata (available columns and checksums)
+/// that is intended to be stored in the part znode in ZooKeeper.
+/// It can also be initialized from the legacy format (from the contents of separate <part>/columns
+/// and <part>/checksums znodes).
+class ReplicatedMergeTreePartHeader
+{
+public:
+    ReplicatedMergeTreePartHeader() = default;
+
+    static ReplicatedMergeTreePartHeader fromColumnsAndChecksumsZNodes(
+        const String & columns_znode, const String & checksums_znode);
+
+    static ReplicatedMergeTreePartHeader fromColumnsAndChecksums(
+        const NamesAndTypesList & columns, const MergeTreeDataPartChecksums & full_checksums);
+
+    void read(ReadBuffer & in);
+    static ReplicatedMergeTreePartHeader fromString(const String & str);
+
+    void write(WriteBuffer & out) const;
+    String toString() const;
+
+    const std::array<char, 16> & getColumnsHash() const { return columns_hash; }
+    const MinimalisticDataPartChecksums & getChecksums() const { return checksums; }
+
+private:
+    ReplicatedMergeTreePartHeader(std::array<char, 16> columns_hash_, MinimalisticDataPartChecksums checksums_)
+        : columns_hash(std::move(columns_hash_)), checksums(std::move(checksums_))
+    {
+    }
+
+    std::array<char, 16> columns_hash;
+    MinimalisticDataPartChecksums checksums;
+};
+
+}
--- a/dbms/src/Storages/StorageMaterializedView.cpp
+++ b/dbms/src/Storages/StorageMaterializedView.cpp
@ -30,7 +30,7 @@ namespace ErrorCodes
 static void extractDependentTable(ASTSelectQuery & query, String & select_database_name, String & select_table_name)
 {
    auto db_and_table = getDatabaseAndTable(query, 0);
-    ASTPtr subquery = getTableFunctionOrSubquery(query, 0);
+    ASTPtr subquery = extractTableExpression(query, 0);

    if (!db_and_table && !subquery)
        return;
@ -69,7 +69,7 @@ static void checkAllowedQueries(const ASTSelectQuery & query)
    if (query.prewhere_expression || query.final() || query.sample_size())
        throw Exception("MATERIALIZED VIEW cannot have PREWHERE, SAMPLE or FINAL.", DB::ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW);

-    ASTPtr subquery = getTableFunctionOrSubquery(query, 0);
+    ASTPtr subquery = extractTableExpression(query, 0);
    if (!subquery)
        return;

--- a/dbms/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/dbms/src/Storages/StorageReplicatedMergeTree.cpp
@ -8,13 +8,14 @@
 #include <Storages/ColumnsDescription.h>
 #include <Storages/StorageReplicatedMergeTree.h>
 #include <Storages/MergeTree/MergeTreeDataPart.h>
+#include <Storages/MergeTree/MergeList.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeQuorumEntry.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeMutationEntry.h>
-#include <Storages/MergeTree/MergeList.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
 #include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h>
+#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>

 #include <Databases/IDatabase.h>

@ -566,11 +567,8 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
            }
        }
        else
-        {
-            LOG_ERROR(log, "Fetching missing part " << missing_name);
            parts_to_fetch.push_back(missing_name);
    }
-    }

    for (const String & name : parts_to_fetch)
        expected_parts.erase(name);
@ -671,25 +669,49 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
        removePartsFromZooKeeper(zookeeper, Strings(expected_parts.begin(), expected_parts.end()));
    }

-    /// Add to the queue job to pick up the missing parts from other replicas and remove from ZK the information that we have them.
-    for (const String & name : parts_to_fetch)
+    /// Add to the queue jobs to pick up the missing parts from other replicas and remove from ZK the information that we have them.
+
+    std::vector<std::future<Coordination::ExistsResponse>> exists_futures;
+    exists_futures.reserve(parts_to_fetch.size());
+    for (const String & part_name : parts_to_fetch)
    {
-        LOG_ERROR(log, "Removing missing part from ZooKeeper and queueing a fetch: " << name);
+        String part_path = replica_path + "/parts/" + part_name;
+        exists_futures.emplace_back(zookeeper->asyncExists(part_path));
+    }
+
+    std::vector<std::future<Coordination::MultiResponse>> enqueue_futures;
+    enqueue_futures.reserve(parts_to_fetch.size());
+    for (size_t i = 0; i < parts_to_fetch.size(); ++i)
+    {
+        const String & part_name = parts_to_fetch[i];
+        LOG_ERROR(log, "Removing locally missing part from ZooKeeper and queueing a fetch: " << part_name);
+
+        Coordination::Requests ops;
+
+        time_t part_create_time = 0;
+        Coordination::ExistsResponse exists_resp = exists_futures[i].get();
+        if (!exists_resp.error)
+        {
+            part_create_time = exists_resp.stat.ctime / 1000;
+            removePartFromZooKeeper(part_name, ops, exists_resp.stat.numChildren > 0);
+        }

        LogEntry log_entry;
        log_entry.type = LogEntry::GET_PART;
        log_entry.source_replica = "";
-        log_entry.new_part_name = name;
-        log_entry.create_time = tryGetPartCreateTime(zookeeper, replica_path, name);
+        log_entry.new_part_name = part_name;
+        log_entry.create_time = part_create_time;

        /// We assume that this occurs before the queue is loaded (queue.initialize).
-        Coordination::Requests ops;
-        removePartFromZooKeeper(name, ops);
        ops.emplace_back(zkutil::makeCreateRequest(
            replica_path + "/queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
-        zookeeper->multi(ops);
+
+        enqueue_futures.emplace_back(zookeeper->asyncMulti(ops));
    }

+    for (auto & future : enqueue_futures)
+        future.get();
+
    /// Remove extra local parts.
    for (const MergeTreeData::DataPartPtr & part : unexpected_parts)
    {
@ -708,18 +730,19 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
    check(part->columns);
    int expected_columns_version = columns_version;

+    auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
+        part->columns, part->checksums);
+
    Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas");
    std::shuffle(replicas.begin(), replicas.end(), rng);
-    String expected_columns_str = part->columns.toString();
-    bool has_been_alredy_added = false;
+    bool has_been_already_added = false;

    for (const String & replica : replicas)
    {
-        Coordination::Stat stat_before, stat_after;
        String current_part_path = zookeeper_path + "/replicas/" + replica + "/parts/" + part_name;

-        String columns_str;
-        if (!zookeeper->tryGet(current_part_path + "/columns", columns_str, &stat_before))
+        String part_zk_str;
+        if (!zookeeper->tryGet(current_part_path, part_zk_str))
        {
            if (absent_replicas_paths)
                absent_replicas_paths->emplace(current_part_path);
@ -727,30 +750,41 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
            continue;
        }

-        if (columns_str != expected_columns_str)
+        ReplicatedMergeTreePartHeader replica_part_header;
+        if (!part_zk_str.empty())
+            replica_part_header = ReplicatedMergeTreePartHeader::fromString(part_zk_str);
+        else
        {
-            LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica
-                << " because columns are different");
-            continue;
-        }
-
+            Coordination::Stat columns_stat_before, columns_stat_after;
+            String columns_str;
            String checksums_str;
            /// Let's check that the node's version with the columns did not change while we were reading the checksums.
            /// This ensures that the columns and the checksum refer to the same data.
-        if (!zookeeper->tryGet(current_part_path + "/checksums", checksums_str) ||
-            !zookeeper->exists(current_part_path + "/columns", &stat_after) ||
-            stat_before.version != stat_after.version)
+            if (!zookeeper->tryGet(current_part_path + "/columns", columns_str, &columns_stat_before) ||
+                !zookeeper->tryGet(current_part_path + "/checksums", checksums_str) ||
+                !zookeeper->exists(current_part_path + "/columns", &columns_stat_after) ||
+                columns_stat_before.version != columns_stat_after.version)
            {
                LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica
                    << " because part changed while we were reading its checksums");
                continue;
            }

-        auto zk_checksums = MinimalisticDataPartChecksums::deserializeFrom(checksums_str);
-        zk_checksums.checkEqual(part->checksums, true);
+            replica_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
+                columns_str, checksums_str);
+        }
+
+        if (replica_part_header.getColumnsHash() != local_part_header.getColumnsHash())
+        {
+            LOG_INFO(log, "Not checking checksums of part " << part_name << " with replica " << replica
+                << " because columns are different");
+            continue;
+        }
+
+        replica_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true);

        if (replica == replica_name)
-            has_been_alredy_added = true;
+            has_been_already_added = true;

        /// If we verify checksums in "sequential manner" (i.e. recheck absence of checksums on other replicas when commit)
        /// then it is enough to verify checksums on at least one replica since checksums on other replicas must be the same.
@ -761,12 +795,20 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
        }
    }

-    if (!has_been_alredy_added)
+    if (!has_been_already_added)
    {
        String part_path = replica_path + "/parts/" + part_name;

        ops.emplace_back(zkutil::makeCheckRequest(
            zookeeper_path + "/columns", expected_columns_version));
+
+        if (data.settings.use_minimalistic_part_header_in_zookeeper)
+        {
+            ops.emplace_back(zkutil::makeCreateRequest(
+                part_path, local_part_header.toString(), zkutil::CreateMode::Persistent));
+        }
+        else
+        {
            ops.emplace_back(zkutil::makeCreateRequest(
                part_path, "", zkutil::CreateMode::Persistent));
            ops.emplace_back(zkutil::makeCreateRequest(
@ -774,6 +816,7 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
            ops.emplace_back(zkutil::makeCreateRequest(
                part_path + "/checksums", getChecksumsForZooKeeper(part->checksums), zkutil::CreateMode::Persistent));
        }
+    }
    else
    {
        LOG_WARNING(log, "checkPartAndAddToZooKeeper: node " << replica_path + "/parts/" + part_name << " already exists."
@ -1510,16 +1553,8 @@ void StorageReplicatedMergeTree::executeClearColumnInPartition(const LogEntry &
        if (!transaction)
            continue;

-        /// Update part metadata in ZooKeeper.
-        Coordination::Requests ops;
-        ops.emplace_back(zkutil::makeSetRequest(
-            replica_path + "/parts/" + part->name + "/columns", transaction->getNewColumns().toString(), -1));
-        ops.emplace_back(zkutil::makeSetRequest(
-            replica_path + "/parts/" + part->name + "/checksums", getChecksumsForZooKeeper(transaction->getNewChecksums()), -1));
+        updatePartHeaderInZooKeeperAndCommit(zookeeper, *transaction);

-        zookeeper->multi(ops);
-
-        transaction->commit();
        ++modified_parts;
    }

@ -2322,12 +2357,15 @@ bool StorageReplicatedMergeTree::createLogEntryToMutatePart(const MergeTreeDataP
 }


-void StorageReplicatedMergeTree::removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops)
+void StorageReplicatedMergeTree::removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops, bool has_children)
 {
    String part_path = replica_path + "/parts/" + part_name;

+    if (has_children)
+    {
        ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/checksums", -1));
        ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/columns", -1));
+    }
    ops.emplace_back(zkutil::makeRemoveRequest(part_path, -1));
 }

@ -2338,19 +2376,26 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n

    String part_path = replica_path + "/parts/" + part_name;

+    Coordination::Requests ops;
+
+    time_t part_create_time = 0;
+    Coordination::Stat stat;
+    if (zookeeper->exists(part_path, &stat))
+    {
+        part_create_time = stat.ctime / 1000;
+        removePartFromZooKeeper(part_name, ops, stat.numChildren > 0);
+    }
+
    LogEntryPtr log_entry = std::make_shared<LogEntry>();
    log_entry->type = LogEntry::GET_PART;
-    log_entry->create_time = tryGetPartCreateTime(zookeeper, replica_path, part_name);
+    log_entry->create_time = part_create_time;
    log_entry->source_replica = "";
    log_entry->new_part_name = part_name;

-    Coordination::Requests ops;
    ops.emplace_back(zkutil::makeCreateRequest(
        replica_path + "/queue/queue-", log_entry->toString(),
        zkutil::CreateMode::PersistentSequential));

-    removePartFromZooKeeper(part_name, ops);
-
    auto results = zookeeper->multi(ops);

    String path_created = dynamic_cast<const Coordination::CreateResponse &>(*results[0]).path_created;
@ -2691,8 +2736,18 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Strin
            MinimalisticDataPartChecksums source_part_checksums;
            source_part_checksums.computeTotalChecksums(source_part->checksums);

-            String desired_checksums_str = getZooKeeper()->get(source_replica_path + "/parts/" + part_name + "/checksums");
-            auto desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str);
+            MinimalisticDataPartChecksums desired_checksums;
+            auto zookeeper = getZooKeeper();
+            String part_path = source_replica_path + "/parts/" + part_name;
+            String part_znode = zookeeper->get(part_path);
+            if (!part_znode.empty())
+                desired_checksums = ReplicatedMergeTreePartHeader::fromString(part_znode).getChecksums();
+            else
+            {
+                String desired_checksums_str = zookeeper->get(part_path + "/checksums");
+                desired_checksums = MinimalisticDataPartChecksums::deserializeFrom(desired_checksums_str);
+            }
+
            if (source_part_checksums == desired_checksums)
            {
                LOG_TRACE(log, "Found local part " << source_part->name << " with the same checksums as " << part_name);
@ -4450,32 +4505,40 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(MergeTre

 bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const Strings & part_names, size_t max_retries)
 {
-    using MultiFuture = std::future<Coordination::MultiResponse>;
-
    size_t num_tries = 0;
-    bool sucess = false;
+    bool success = false;

-    while (!sucess && (max_retries == 0 || num_tries < max_retries))
+    while (!success && (max_retries == 0 || num_tries < max_retries))
    {
-        std::vector<MultiFuture> futures;
-        futures.reserve(part_names.size());
-
-        ++num_tries;
-        sucess = true;
-
        try
        {
+            ++num_tries;
+            success = true;
+
            auto zookeeper = getZooKeeper();

+            std::vector<std::future<Coordination::ExistsResponse>> exists_futures;
+            exists_futures.reserve(part_names.size());
            for (const String & part_name : part_names)
            {
-                Coordination::Requests ops;
-                removePartFromZooKeeper(part_name, ops);
-
-                futures.emplace_back(zookeeper->tryAsyncMulti(ops));
+                String part_path = replica_path + "/parts/" + part_name;
+                exists_futures.emplace_back(zookeeper->asyncExists(part_path));
            }

-            for (auto & future : futures)
+            std::vector<std::future<Coordination::MultiResponse>> remove_futures;
+            remove_futures.reserve(part_names.size());
+            for (size_t i = 0; i < part_names.size(); ++i)
+            {
+                Coordination::ExistsResponse exists_resp = exists_futures[i].get();
+                if (!exists_resp.error)
+                {
+                    Coordination::Requests ops;
+                    removePartFromZooKeeper(part_names[i], ops, exists_resp.stat.numChildren > 0);
+                    remove_futures.emplace_back(zookeeper->tryAsyncMulti(ops));
+                }
+            }
+
+            for (auto & future : remove_futures)
            {
                auto response = future.get();

@ -4484,7 +4547,7 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const St

                if (Coordination::isHardwareError(response.error))
                {
-                    sucess = false;
+                    success = false;
                    continue;
                }

@ -4493,7 +4556,7 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const St
        }
        catch (Coordination::Exception & e)
        {
-            sucess = false;
+            success = false;

            if (Coordination::isHardwareError(e.code))
                tryLogCurrentException(log, __PRETTY_FUNCTION__);
@ -4501,69 +4564,78 @@ bool StorageReplicatedMergeTree::tryRemovePartsFromZooKeeperWithRetries(const St
                throw;
        }

-        if (!sucess && num_tries < max_retries)
+        if (!success && (max_retries == 0 || num_tries < max_retries)) /// max_retries == 0 means unlimited retries (see the loop condition); back off in that mode too.
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    }

-    return sucess;
+    return success;
 }

-/// TODO: rewrite this code using async Multi ops after final ZooKeeper library update
-void StorageReplicatedMergeTree::removePartsFromZooKeeper(zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names,
-                                                          NameSet * parts_should_be_retried)
+void StorageReplicatedMergeTree::removePartsFromZooKeeper(
+    zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names, NameSet * parts_should_be_retried)
 {
+    std::vector<std::future<Coordination::ExistsResponse>> exists_futures;
+    exists_futures.reserve(part_names.size());
+    for (const String & part_name : part_names)
+    {
+        String part_path = replica_path + "/parts/" + part_name;
+        exists_futures.emplace_back(zookeeper->asyncExists(part_path));
+    }
+
+    std::vector<std::future<Coordination::MultiResponse>> remove_futures;
+    remove_futures.reserve(part_names.size());
+    try
+    {
+        for (size_t i = 0; i < part_names.size(); ++i)
+        {
+            Coordination::ExistsResponse exists_resp = exists_futures[i].get();
+            if (!exists_resp.error)
+            {
                Coordination::Requests ops;
-    auto it_first_node_in_batch = part_names.cbegin();
-
-    for (auto it = part_names.cbegin(); it != part_names.cend(); ++it)
-    {
-        removePartFromZooKeeper(*it, ops);
-
-        auto it_next = std::next(it);
-        if (ops.size() >= zkutil::MULTI_BATCH_SIZE || it_next == part_names.cend())
-        {
-            Coordination::Responses unused_responses;
-            auto code = zookeeper->tryMultiNoThrow(ops, unused_responses);
-            ops.clear();
-
-            if (code == Coordination::ZNONODE)
-            {
-                /// Fallback
-                LOG_DEBUG(log, "ZooKeeper nodes for some parts in the batch are missing, will remove part nodes one by one");
-
-                for (auto it_in_batch = it_first_node_in_batch; it_in_batch != it_next; ++it_in_batch)
-                {
-                    Coordination::Requests cur_ops;
-                    removePartFromZooKeeper(*it_in_batch, cur_ops);
-                    auto cur_code = zookeeper->tryMultiNoThrow(cur_ops, unused_responses);
-
-                    if (cur_code == Coordination::ZNONODE)
-                    {
-                        LOG_DEBUG(log, "There is no part " << *it_in_batch << " in ZooKeeper, it was only in filesystem");
+                removePartFromZooKeeper(part_names[i], ops, exists_resp.stat.numChildren > 0);
+                remove_futures.emplace_back(zookeeper->tryAsyncMulti(ops));
            }
-                    else if (parts_should_be_retried && Coordination::isHardwareError(cur_code))
+            else
            {
-                        parts_should_be_retried->emplace(*it_in_batch);
-                    }
-                    else if (cur_code)
-                    {
-                        LOG_WARNING(log, "Cannot remove part " << *it_in_batch << " from ZooKeeper: " << zkutil::ZooKeeper::error2string(cur_code));
+                LOG_DEBUG(log,
+                    "There is no part " << part_names[i] << " in ZooKeeper, it was only in filesystem");
+                // Emplace an invalid future so that remove_futures stays index-aligned with part_names.
+                remove_futures.emplace_back();
            }
        }
    }
-            else if (parts_should_be_retried && Coordination::isHardwareError(code))
+    catch (const Coordination::Exception & e)
    {
-                for (auto it_in_batch = it_first_node_in_batch; it_in_batch != it_next; ++it_in_batch)
-                    parts_should_be_retried->emplace(*it_in_batch);
-            }
-            else if (code)
-            {
-                LOG_WARNING(log, "There was a problem with deleting " << (it_next - it_first_node_in_batch)
-                    << " nodes from ZooKeeper: " << ::zkutil::ZooKeeper::error2string(code));
+        if (parts_should_be_retried && Coordination::isHardwareError(e.code))
+            parts_should_be_retried->insert(part_names.begin(), part_names.end());
+        throw;
    }

-            it_first_node_in_batch = it_next;
+    for (size_t i = 0; i < remove_futures.size(); ++i)
+    {
+        auto & future = remove_futures[i];
+
+        if (!future.valid())
+            continue;
+
+        auto response = future.get();
+        if (response.error == Coordination::ZOK)
+            continue;
+        else if (response.error == Coordination::ZNONODE)
+        {
+            LOG_DEBUG(log,
+                "There is no part " << part_names[i] << " in ZooKeeper, it was only in filesystem");
+            continue;
        }
+        else if (Coordination::isHardwareError(response.error))
+        {
+            if (parts_should_be_retried)
+                parts_should_be_retried->insert(part_names[i]);
+            continue;
+        }
+        else
+            LOG_WARNING(log, "Cannot remove part " << part_names[i] << " from ZooKeeper: "
+                << zkutil::ZooKeeper::error2string(response.error));
    }
 }

@ -4809,6 +4881,16 @@ void StorageReplicatedMergeTree::getCommitPartOps(
    ops.emplace_back(zkutil::makeCheckRequest(
        zookeeper_path + "/columns",
        columns_version));
+
+    if (data.settings.use_minimalistic_part_header_in_zookeeper)
+    {
+        ops.emplace_back(zkutil::makeCreateRequest(
+            replica_path + "/parts/" + part->name,
+            ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(part->columns, part->checksums).toString(),
+            zkutil::CreateMode::Persistent));
+    }
+    else
+    {
        ops.emplace_back(zkutil::makeCreateRequest(
            replica_path + "/parts/" + part->name,
            "",
@ -4821,6 +4903,59 @@ void StorageReplicatedMergeTree::getCommitPartOps(
            replica_path + "/parts/" + part->name + "/checksums",
            getChecksumsForZooKeeper(part->checksums),
            zkutil::CreateMode::Persistent));
+    }
+}
+
+void StorageReplicatedMergeTree::updatePartHeaderInZooKeeperAndCommit(
+    const zkutil::ZooKeeperPtr & zookeeper,
+    MergeTreeData::AlterDataPartTransaction & transaction)
+{
+    String part_path = replica_path + "/parts/" + transaction.getPartName();
+
+    bool need_delete_columns_and_checksums_nodes = false; /// Set below if the part znode turns out to still have child znodes.
+    try
+    {
+        if (data.settings.use_minimalistic_part_header_in_zookeeper) /// Compact format: the header is stored in the part znode itself.
+        {
+            auto part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
+                transaction.getNewColumns(), transaction.getNewChecksums());
+            Coordination::Stat stat;
+            zookeeper->set(part_path, part_header.toString(), -1, &stat); /// `stat` is filled with the znode metadata by this call.
+
+            need_delete_columns_and_checksums_nodes = stat.numChildren > 0;
+        }
+        else /// Old format: empty part znode, data in its /columns and /checksums children; all three are set in one multi request.
+        {
+            Coordination::Requests ops;
+            ops.emplace_back(zkutil::makeSetRequest(
+                    part_path, String(), -1));
+            ops.emplace_back(zkutil::makeSetRequest(
+                    part_path + "/columns", transaction.getNewColumns().toString(), -1));
+            ops.emplace_back(zkutil::makeSetRequest(
+                    part_path + "/checksums", getChecksumsForZooKeeper(transaction.getNewChecksums()), -1));
+            zookeeper->multi(ops);
+        }
+    }
+    catch (const Coordination::Exception & e)
+    {
+        /// ZNONODE: the part does not exist in ZooKeeper. Enqueue it for a check - maybe the part is superfluous and must be removed locally.
+        if (e.code == Coordination::ZNONODE)
+            enqueuePartForCheck(transaction.getPartName());
+
+        throw; /// Any ZooKeeper error is propagated, so the transaction is not committed by this function.
+    }
+
+    /// ZooKeeper has been updated successfully, now apply the file changes.
+    transaction.commit();
+
+    /// Legacy <part_path>/columns and <part_path>/checksums znodes are not needed anymore and can be deleted (in one multi request).
+    if (need_delete_columns_and_checksums_nodes)
+    {
+        Coordination::Requests ops;
+        ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/columns", -1));
+        ops.emplace_back(zkutil::makeRemoveRequest(part_path + "/checksums", -1));
+        zookeeper->multi(ops);
+    }
 }

 ReplicatedMergeTreeAddress StorageReplicatedMergeTree::getReplicatedMergeTreeAddress() const
--- a/dbms/src/Storages/StorageReplicatedMergeTree.h
+++ b/dbms/src/Storages/StorageReplicatedMergeTree.h
@ -372,8 +372,14 @@ private:
        MergeTreeData::MutableDataPartPtr & part,
        const String & block_id_path = "") const;

+    /// Updates info about part columns and checksums in ZooKeeper and commits transaction if successful.
+    void updatePartHeaderInZooKeeperAndCommit(
+        const zkutil::ZooKeeperPtr & zookeeper,
+        MergeTreeData::AlterDataPartTransaction & transaction);
+
    /// Adds actions to `ops` that remove a part from ZooKeeper.
-    void removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops);
+    /// Set has_children to true for "old-style" parts (those with /columns and /checksums child znodes).
+    void removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops, bool has_children);

    /// Quickly removes big set of parts from ZooKeeper (using async multi queries)
    void removePartsFromZooKeeper(zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names,
--- a/dbms/src/Storages/tests/CMakeLists.txt
+++ b/dbms/src/Storages/tests/CMakeLists.txt
@ -24,3 +24,6 @@ target_link_libraries (get_current_inserts_in_replicated PRIVATE dbms clickhouse

 add_executable (get_abandonable_lock_in_all_partitions get_abandonable_lock_in_all_partitions.cpp)
 target_link_libraries (get_abandonable_lock_in_all_partitions PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper)
+
+add_executable (transform_part_zk_nodes transform_part_zk_nodes.cpp)
+target_link_libraries (transform_part_zk_nodes PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper)
--- a/dbms/src/Storages/tests/transform_part_zk_nodes.cpp
+++ b/dbms/src/Storages/tests/transform_part_zk_nodes.cpp
@ -0,0 +1,146 @
+#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
+#include <Storages/MergeTree/MergeTreeDataPartChecksum.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/WriteHelpers.h>
+#include <Common/Exception.h>
+#include <Common/StringUtils/StringUtils.h>
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <Common/ZooKeeper/KeeperException.h>
+
+#include <boost/program_options.hpp>
+
+#include <future>
+#include <list>
+#include <iostream>
+
+
+/// Transforms part znodes below the given ZooKeeper path to the compact storage scheme.
+/// The tree is traversed breadth-first with asynchronous requests. For every non-ephemeral node whose path
+/// contains "/parts/" and which has children, the contents of its "columns" and "checksums" children are combined
+/// into a ReplicatedMergeTreePartHeader that is written into the node itself; both children are removed
+/// in the same multi request.
+int main(int argc, char ** argv)
+try
+{
+    boost::program_options::options_description desc("Allowed options");
+    desc.add_options()
+        ("help,h", "produce help message")
+        ("address,a", boost::program_options::value<std::string>()->required(),
+            "addresses of ZooKeeper instances, comma separated. Example: example01e.yandex.ru:2181")
+        ("path,p", boost::program_options::value<std::string>()->required(),
+            "where to start")
+    ;
+
+    boost::program_options::variables_map options;
+    boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
+
+    if (options.count("help"))
+    {
+        std::cout << "Transform contents of part nodes in ZooKeeper to more compact storage scheme." << std::endl;
+        std::cout << "Usage: " << argv[0] << " [options]" << std::endl;
+        std::cout << desc << std::endl;
+        return 1;
+    }
+
+    /// Validate required options only after --help has been handled, so that `--help` alone still works.
+    /// Without this call a missing --address/--path surfaces as an uninformative std::out_of_range from options.at().
+    boost::program_options::notify(options);
+
+    zkutil::ZooKeeper zookeeper(options.at("address").as<std::string>());
+
+    std::string initial_path = options.at("path").as<std::string>();
+
+    /// A znode scheduled for inspection together with its in-flight requests.
+    struct Node
+    {
+        Node(
+            std::string path_,
+            std::future<Coordination::GetResponse> get_future_,
+            std::future<Coordination::ListResponse> children_future_,
+            Node * parent_)
+            : path(std::move(path_))
+            , get_future(std::move(get_future_))
+            , children_future(std::move(children_future_))
+            , parent(parent_)
+        {
+        }
+
+        std::string path;
+        std::future<Coordination::GetResponse> get_future;
+        std::future<Coordination::ListResponse> children_future;
+
+        Node * parent = nullptr;
+        std::future<Coordination::MultiResponse> set_future;
+    };
+
+    /// std::list keeps element addresses stable while children are appended during the iteration below.
+    std::list<Node> nodes_queue;
+    nodes_queue.emplace_back(
+        initial_path, zookeeper.asyncGet(initial_path), zookeeper.asyncGetChildren(initial_path), nullptr);
+
+    for (auto it = nodes_queue.begin(); it != nodes_queue.end(); ++it)
+    {
+        Coordination::GetResponse get_response;
+        Coordination::ListResponse children_response;
+        try
+        {
+            get_response = it->get_future.get();
+            children_response = it->children_future.get();
+        }
+        catch (const Coordination::Exception & e)
+        {
+            /// The node has disappeared after its requests were scheduled - nothing to do for it.
+            if (e.code == Coordination::ZNONODE)
+                continue;
+            throw;
+        }
+
+        /// Ephemeral nodes are neither transformed nor descended into.
+        if (get_response.stat.ephemeralOwner)
+            continue;
+
+        if (it->path.find("/parts/") != std::string::npos
+            && !endsWith(it->path, "/columns")
+            && !endsWith(it->path, "/checksums"))
+        {
+            /// A part node that still has children is stored in the old format.
+            if (!children_response.names.empty())
+            {
+                auto part_header = DB::ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
+                    zookeeper.get(it->path + "/columns"), zookeeper.get(it->path + "/checksums"));
+
+                Coordination::Requests ops;
+                ops.emplace_back(zkutil::makeRemoveRequest(it->path + "/columns", -1));
+                ops.emplace_back(zkutil::makeRemoveRequest(it->path + "/checksums", -1));
+                ops.emplace_back(zkutil::makeSetRequest(it->path, part_header.toString(), -1));
+
+                it->set_future = zookeeper.asyncMulti(ops);
+            }
+        }
+        else
+        {
+            for (const auto & name : children_response.names)
+            {
+                std::string child_path = it->path == "/" ? it->path + name : it->path + '/' + name;
+                nodes_queue.emplace_back(
+                    child_path, zookeeper.asyncGet(child_path), zookeeper.asyncGetChildren(child_path),
+                    &(*it));
+            }
+        }
+    }
+
+    /// Wait for every transformation that was started and report the changed nodes.
+    for (auto it = nodes_queue.begin(); it != nodes_queue.end(); ++it)
+    {
+        if (it->set_future.valid())
+        {
+            it->set_future.get();
+            std::cerr << it->path << " changed!" << std::endl;
+        }
+    }
+}
+catch (...)
+{
+    std::cerr << DB::getCurrentExceptionMessage(true) << '\n';
+    throw;
+}
--- a/dbms/src/TableFunctions/TableFunctionRemote.cpp
+++ b/dbms/src/TableFunctions/TableFunctionRemote.cpp
@ -12,6 +12,7 @@
 #include <Common/typeid_cast.h>
 #include <Common/parseRemoteDescription.h>
 #include <TableFunctions/TableFunctionFactory.h>
+#include <Core/Defines.h>


 namespace DB
@ -152,7 +153,8 @@ StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & ast_function, const C
        if (names.empty())
            throw Exception("Shard list is empty after parsing first argument", ErrorCodes::BAD_ARGUMENTS);

-        cluster = std::make_shared<Cluster>(context.getSettings(), names, username, password, context.getTCPPort(), false);
+        auto maybe_secure_port = context.getTCPPortSecure();
+        cluster = std::make_shared<Cluster>(context.getSettings(), names, username, password, (secure ? (maybe_secure_port ? *maybe_secure_port : DBMS_DEFAULT_SECURE_PORT) : context.getTCPPort()), false, secure);
    }

    auto structure_remote_table = getStructureOfRemoteTable(*cluster, remote_database, remote_table, context, remote_table_function_ptr);
@ -177,8 +179,8 @@ StoragePtr TableFunctionRemote::executeImpl(const ASTPtr & ast_function, const C
 }


-TableFunctionRemote::TableFunctionRemote(const std::string & name_)
-    : name(name_)
+TableFunctionRemote::TableFunctionRemote(const std::string & name_, bool secure)
+    : name{name_}, secure{secure}
 {
    is_cluster_function = name == "cluster";

@ -193,6 +195,7 @@ TableFunctionRemote::TableFunctionRemote(const std::string & name_)
 void registerTableFunctionRemote(TableFunctionFactory & factory)
 {
    factory.registerFunction("remote", [] () -> TableFunctionPtr { return std::make_shared<TableFunctionRemote>("remote"); });
+    factory.registerFunction("remoteSecure", [] () -> TableFunctionPtr { return std::make_shared<TableFunctionRemote>("remote", /* secure = */ true); });
    factory.registerFunction("cluster", [] () -> TableFunctionPtr { return std::make_shared<TableFunctionRemote>("cluster"); });
 }

--- a/dbms/src/TableFunctions/TableFunctionRemote.h
+++ b/dbms/src/TableFunctions/TableFunctionRemote.h
@ -16,7 +16,7 @@ namespace DB
 class TableFunctionRemote : public ITableFunction
 {
 public:
-    explicit TableFunctionRemote(const std::string & name_ = "remote");
+    explicit TableFunctionRemote(const std::string & name_ = "remote", bool secure = false);

    std::string getName() const override { return name; }

@ -26,6 +26,7 @@ private:
    std::string name;
    bool is_cluster_function;
    std::string help_message;
+    bool secure;
 };

 }
--- a/dbms/tests/clickhouse-test
+++ b/dbms/tests/clickhouse-test
@ -132,6 +132,9 @@ def main(args):
            if 'stateful' in suite and not is_data_present():
                print("Won't run stateful tests because test data wasn't loaded.")
                continue
+            if 'stateless' in suite and args.no_stateless:
+                print("Won't run stateless tests because they were manually disabled.")
+                continue

            # Reverse sort order: we want run newest test first.
            # And not reverse subtests
@ -343,6 +346,7 @@ if __name__ == '__main__':
    parser.add_argument('--order', default = 'desc', help = 'Run order (asc, desc, random)')
    parser.add_argument('--testname', action = 'store_true', default = None, dest = 'testname', help = 'Make query with test name before test run')

+    parser.add_argument('--no-stateless', action = 'store_true', help = 'Disable all stateless tests')
    parser.add_argument('--skip', nargs='+', help = "Skip these tests")
    parser.add_argument('--no-long', action = 'store_false', dest = 'no_long', help = 'Do not run long tests')
    group = parser.add_mutually_exclusive_group(required = False)
--- a/dbms/tests/performance/string_search/constant_column_comparison.xml
+++ b/dbms/tests/performance/string_search/constant_column_comparison.xml
@ -0,0 +1,46 @@
+<test>
+    <name>Constant column string search</name>
+
+    <tags>
+        <tag>search</tag>
+    </tags>
+
+    <preconditions>
+        <table_exists>hits_100m_single</table_exists>
+    </preconditions>
+
+    <type>loop</type>
+
+    <stop_conditions>
+        <all_of>
+            <iterations>5</iterations>
+            <min_time_not_changing_for_ms>10000</min_time_not_changing_for_ms>
+        </all_of>
+        <any_of>
+            <iterations>50</iterations>
+            <total_time_ms>60000</total_time_ms>
+        </any_of>
+    </stop_conditions>
+
+    <query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
+    <query><![CDATA[select count(multiPosition(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
+    <query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>
+
+    <query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
+    <query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
+    <query><![CDATA[select sum(match(URL, 'yandex|google|yahoo|pikabu')) FROM hits_100m_single]]></query>
+
+    <query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'http')) FROM hits_100m_single]]></query>
+    <query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
+    <query><![CDATA[select sum(match(URL, 'yandex|google|http')) FROM hits_100m_single]]></query>
+
+    <query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'facebook')), sum(match(URL, 'wikipedia')), sum(match(URL, 'reddit')) FROM hits_100m_single]]></query>
+    <query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
+    <query><![CDATA[select sum(match(URL, 'yandex|google|facebook|wikipedia|reddit')) FROM hits_100m_single]]></query>
+
+    <query><![CDATA[select sum(firstMatch(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
+
+    <main_metric>
+        <min_time/>
+    </main_metric>
+</test>
--- a/dbms/tests/queries/0_stateless/00233_position_function_family.reference
+++ b/dbms/tests/queries/0_stateless/00233_position_function_family.reference
--- a/dbms/tests/queries/0_stateless/00233_position_function_family.sql
+++ b/dbms/tests/queries/0_stateless/00233_position_function_family.sql
--- a/dbms/tests/queries/0_stateless/00505_shard_secure.reference
+++ b/dbms/tests/queries/0_stateless/00505_shard_secure.reference
@ -0,0 +1,10 @@
+0
+0
+0
+0
+0
+0
+0
+0
+0
+0
--- a/dbms/tests/queries/0_stateless/00505_shard_secure.sh
+++ b/dbms/tests/queries/0_stateless/00505_shard_secure.sh
@ -0,0 +1,25 @
+#!/usr/bin/env bash
+
+# set -x
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. "$CURDIR"/../shell_config.sh
+
+# This test needs a non-default server config: a secure TCP port and the test_shard_localhost_secure cluster.
+
+tcp_port_secure=$($CLICKHOUSE_EXTRACT_CONFIG -k tcp_port_secure 2>/dev/null)
+# Keep the expansion quoted: unquoted, an empty value only works by accident ("[ -z ]" is a one-argument test).
+if [ -z "$tcp_port_secure" ]; then
+    # Secure port is disabled: print the expected output so that the test still passes.
+    cat "$CURDIR"/00505_shard_secure.reference
+else
+
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure('127.0.0.{1,2}', system.one);"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure('127.0.0.{1,2}:$CLICKHOUSE_PORT_TCP_SECURE', system.one);"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure('127.0.0.{1,2}', system.one);"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure(test_shard_localhost_secure, system.one);"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remote(test_shard_localhost_secure, system.one);"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remoteSecure(test_shard_localhost, system.one);"
+    $CLICKHOUSE_CLIENT -q "SELECT * FROM remote(test_shard_localhost, system.one);"
+
+fi
--- a/dbms/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.reference
+++ b/dbms/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.reference
@ -0,0 +1,35 @@
+*** Test fetches ***
+*** replica 1 ***
+1	1
+2	2
+*** replica 2 ***
+1	1
+2	2
+*** Test merges ***
+*** replica 1 ***
+all_0_1_1	1
+all_0_1_1	2
+*** replica 2 ***
+all_0_1_1	1
+all_0_1_1	2
+*** Test part removal ***
+*** replica 1 ***
+all_0_1_1
+all_0_1_1
+*** replica 2 ***
+all_0_1_1
+all_0_1_1
+*** Test ALTER ***
+*** replica 1 ***
+1	1
+2	1
+*** replica 2 ***
+1	1
+2	1
+*** Test CLEAR COLUMN ***
+*** replica 1 ***
+1	0
+2	0
+*** replica 2 ***
+1	0
+2	0
--- a/dbms/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql
+++ b/dbms/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sql
@ -0,0 +1,62 @
+DROP TABLE IF EXISTS test.part_header_r1;
+DROP TABLE IF EXISTS test.part_header_r2;
+
+CREATE TABLE test.part_header_r1(x UInt32, y UInt32)
+    ENGINE ReplicatedMergeTree('/clickhouse/tables/test/part_header', '1') ORDER BY x
+    SETTINGS use_minimalistic_part_header_in_zookeeper = 0,
+             old_parts_lifetime = 1,
+             cleanup_delay_period = 0,
+             cleanup_delay_period_random_add = 0;
+CREATE TABLE test.part_header_r2(x UInt32, y UInt32)
+    ENGINE ReplicatedMergeTree('/clickhouse/tables/test/part_header', '2') ORDER BY x
+    SETTINGS use_minimalistic_part_header_in_zookeeper = 1,
+             old_parts_lifetime = 1,
+             cleanup_delay_period = 0,
+             cleanup_delay_period_random_add = 0;
+
+SELECT '*** Test fetches ***';
+INSERT INTO test.part_header_r1 VALUES (1, 1);
+INSERT INTO test.part_header_r2 VALUES (2, 2);
+SYSTEM SYNC REPLICA test.part_header_r1;
+SYSTEM SYNC REPLICA test.part_header_r2;
+SELECT '*** replica 1 ***';
+SELECT x, y FROM test.part_header_r1 ORDER BY x;
+SELECT '*** replica 2 ***';
+SELECT x, y FROM test.part_header_r2 ORDER BY x;
+
+SELECT '*** Test merges ***';
+OPTIMIZE TABLE test.part_header_r1;
+SYSTEM SYNC REPLICA test.part_header_r2;
+SELECT '*** replica 1 ***';
+SELECT _part, x FROM test.part_header_r1 ORDER BY x;
+SELECT '*** replica 2 ***';
+SELECT _part, x FROM test.part_header_r2 ORDER BY x;
+
+-- Give the cleanup of old parts time to run (old_parts_lifetime = 1), then check every replica against its own parts node in ZooKeeper.
+SELECT sleep(2) FORMAT Null;
+
+SELECT '*** Test part removal ***';
+SELECT '*** replica 1 ***';
+SELECT name FROM system.parts WHERE database = 'test' AND table = 'part_header_r1';
+SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/test/part_header/replicas/1/parts';
+SELECT '*** replica 2 ***';
+SELECT name FROM system.parts WHERE database = 'test' AND table = 'part_header_r2';
+SELECT name FROM system.zookeeper WHERE path = '/clickhouse/tables/test/part_header/replicas/2/parts';
+
+SELECT '*** Test ALTER ***';
+ALTER TABLE test.part_header_r1 MODIFY COLUMN y String;
+SELECT '*** replica 1 ***';
+SELECT x, length(y) FROM test.part_header_r1 ORDER BY x;
+SELECT '*** replica 2 ***';
+SELECT x, length(y) FROM test.part_header_r2 ORDER BY x;
+
+SELECT '*** Test CLEAR COLUMN ***';
+SET replication_alter_partitions_sync = 2;
+ALTER TABLE test.part_header_r1 CLEAR COLUMN y IN PARTITION tuple();
+SELECT '*** replica 1 ***';
+SELECT x, length(y) FROM test.part_header_r1 ORDER BY x;
+SELECT '*** replica 2 ***';
+SELECT x, length(y) FROM test.part_header_r2 ORDER BY x;
+
+DROP TABLE test.part_header_r1;
+DROP TABLE test.part_header_r2;
--- a/dbms/tests/queries/0_stateless/00823_capnproto_input.reference
+++ b/dbms/tests/queries/0_stateless/00823_capnproto_input.reference
@ -0,0 +1 @@
+1	2	5	3	4	7	6	
--- a/dbms/tests/queries/0_stateless/00823_capnproto_input.sh
+++ b/dbms/tests/queries/0_stateless/00823_capnproto_input.sh
@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+set -e 
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. $CURDIR/../shell_config.sh
+
+#create the schema file
+echo "
+@0x803231eaa402b968;
+struct NestedNestedOne
+{
+    nestednestednumber @0 : UInt64;
+}
+struct NestedNestedTwo
+{
+    nestednestedtext @0 : Text;
+}
+struct NestedOne
+{
+    nestednestedone @0 : NestedNestedOne;
+    nestednestedtwo @1 : NestedNestedTwo;
+    nestednumber @2: UInt64;
+}
+struct NestedTwo
+{
+    nestednestedone @0 : NestedNestedOne;
+    nestednestedtwo @1 : NestedNestedTwo;
+    nestedtext @2 : Text;
+}
+struct CapnProto
+{
+    number @0 : UInt64;
+    string @1 : Text;
+    nestedone @2 : NestedOne;
+    nestedtwo @3 : NestedTwo;
+    nestedthree @4 : NestedNestedTwo;
+}" > test.capnp
+
+$CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS test.capnproto_input"
+$CLICKHOUSE_CLIENT -q "CREATE TABLE test.capnproto_input
+(
+    number UInt64,
+    string String,
+    nestedone_nestednumber UInt64,
+    nestedone_nestednestedone_nestednestednumber UInt64,
+    nestedone_nestednestedtwo_nestednestedtext String,
+    nestedtwo_nestednestedtwo_nestednestedtext String,
+    nestedtwo_nestednestedone_nestednestednumber UInt64,
+    nestedtwo_nestedtext String
+) ENGINE = Memory"
+
+echo  -ne '\x00\x00\x00\x00\x15\x00\x00\x00\x00\x00\x00\x00\x01\x00\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x0d\x00\x00\x00\x12\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x02\x00\x20\x00\x00\x00\x00\x00\x03\x00\x34\x00\x00\x00\x00\x00\x01\x00\x32\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x01\x00\x00\x00\x04\x00\x00\x00\x00\x00\x01\x00\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x34\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x01\x00\x00\x00\x08\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x37\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x12\x00\x00\x00\x39\x00\x00\x00\x00\x00\x00\x00' | $CLICKHOUSE_CLIENT --stacktrace --format_schema='test:CapnProto' --query="INSERT INTO test.capnproto_input FORMAT CapnProto";
+
+$CLICKHOUSE_CLIENT -q "SELECT * FROM test.capnproto_input"
+$CLICKHOUSE_CLIENT -q "DROP TABLE test.capnproto_input"
+
+# remove the schema file
+rm test.capnp
--- a/dbms/tests/queries/0_stateless/00823_sequence_match_dfa.reference
+++ b/dbms/tests/queries/0_stateless/00823_sequence_match_dfa.reference
@ -0,0 +1 @@
+1
--- a/dbms/tests/queries/0_stateless/00823_sequence_match_dfa.sql
+++ b/dbms/tests/queries/0_stateless/00823_sequence_match_dfa.sql
@ -0,0 +1,25 @@
+-- this test cannot pass without the new DFA matching algorithm of sequenceMatch
+
+DROP TABLE IF EXISTS test.sequence;
+
+CREATE TABLE test.sequence
+(
+    userID UInt64,
+    eventType Enum8('A' = 1, 'B' = 2, 'C' = 3),
+    EventTime UInt64
+)
+ENGINE = Memory;
+
+INSERT INTO test.sequence SELECT 1, number = 0 ? 'A' : (number < 1000000 ? 'B' : 'C'), number FROM numbers(1000001);
+
+SELECT userID
+FROM test.sequence
+GROUP BY userID
+HAVING sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(EventTime), eventType = 'A', eventType = 'B', eventType = 'C');
+
+SELECT userID
+FROM test.sequence
+GROUP BY userID
+HAVING sequenceMatch('(?1).*(?2).*(?3)')(toDateTime(EventTime), eventType = 'A', eventType = 'B', eventType = 'A');
+
+DROP TABLE test.sequence;
--- a/debian/changelog
+++ b/debian/changelog
@ -1,5 +1,5 @@
-clickhouse (19.1.0) unstable; urgency=low
+clickhouse (19.1.1) unstable; urgency=low

  * Modified source code

- --  <root@yandex-team.ru>  Tue, 01 Jan 2019 07:16:20 +0300
+ --  <root@yandex-team.ru>  Wed, 16 Jan 2019 14:04:37 +0300
--- a/debian/control
+++ b/debian/control
@ -37,6 +37,7 @@ Description: Common files for ClickHouse
 Package: clickhouse-server
 Architecture: all
 Depends: ${shlibs:Depends}, ${misc:Depends}, clickhouse-common-static (= ${binary:Version}), adduser
+Recommends: libcap2-bin
 Replaces: clickhouse-server-common, clickhouse-server-base
 Provides: clickhouse-server-common
 Description: Server binary for ClickHouse
--- a/debian/rules
+++ b/debian/rules
@ -8,7 +8,7 @@ export DH_VERBOSE=1
 export DEB_BUILD_MAINT_OPTIONS=hardening=+all,-pie

 # because copy_headers.sh have hardcoded path to build/include_directories.txt
-BUILDDIR = build
+BUILDDIR = obj-$(DEB_HOST_GNU_TYPE)
 CURDIR = $(shell pwd)
 DESTDIR = $(CURDIR)/debian/tmp

@ -80,23 +80,23 @@ ifneq ($(THREADS_COUNT),)
 endif

 %:
-	dh $@ $(DH_FLAGS) --buildsystem=cmake --builddirectory=$(BUILDDIR)
+	dh $@ $(DH_FLAGS) --buildsystem=cmake

 override_dh_auto_configure:
 	dh_auto_configure -- $(CMAKE_FLAGS)

 override_dh_auto_build:
 	# Fix for ninja. Do not add -O.
-	cd $(BUILDDIR) && $(MAKE) $(THREADS_COUNT) $(MAKE_TARGET)
-	#cd $(BUILDDIR) && cmake --build . -- -j$(THREADS_COUNT) # cmake return true on error
+	$(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) $(MAKE_TARGET)
+#	#cd $(BUILDDIR) && cmake --build . -- -j$(THREADS_COUNT) # cmake return true on error

 override_dh_auto_test:
-	#TODO, use ENABLE_TESTS=1
-	#./debian/tests_wrapper.sh
+#	#TODO, use ENABLE_TESTS=1
+#	#./debian/tests_wrapper.sh
+#	cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V -R GLIBC_required_version
+	cd $(BUILDDIR) && ctest $(THREADS_COUNT) -V -E with_server

 override_dh_clean:
-	rm -rf $(BUILDDIR)
-	rm -rf $(DESTDIR)
 	rm -rf debian/copyright debian/clickhouse-client.docs debian/clickhouse-common-static.docs
 	dh_clean

@ -130,7 +130,7 @@ override_dh_install:
 	dh_install --list-missing --sourcedir=$(DESTDIR)

 override_dh_auto_install:
-	env DESTDIR=$(DESTDIR) $(MAKE) -C $(BUILDDIR) install
+	env DESTDIR=$(DESTDIR) $(MAKE) $(THREADS_COUNT) -C $(BUILDDIR) install

 override_dh_shlibdeps:
 	true # We depend only on libc and dh_shlibdeps gives us wrong (too strict) dependency.
--- a/docker/client/Dockerfile
+++ b/docker/client/Dockerfile
@ -1,7 +1,7 @@
 FROM ubuntu:18.04

 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/"
-ARG version=19.1.0
+ARG version=19.1.1

 RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
--- a/docker/server/Dockerfile
+++ b/docker/server/Dockerfile
@ -1,7 +1,7 @@
 FROM ubuntu:18.04

 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/"
-ARG version=19.1.0
+ARG version=19.1.1
 ARG gosu_ver=1.10

 RUN apt-get update \
--- a/docker/test/Dockerfile
+++ b/docker/test/Dockerfile
@ -1,7 +1,7 @@
 FROM ubuntu:18.04

 ARG repository="deb http://repo.yandex.ru/clickhouse/deb/stable/ main/"
-ARG version=19.1.0
+ARG version=19.1.1

 RUN apt-get update && \
    apt-get install -y apt-transport-https dirmngr && \
--- a/docker/test/stateless/Dockerfile
+++ b/docker/test/stateless/Dockerfile
@ -18,7 +18,8 @@ RUN apt-get update -y \
            sudo \
            openssl \
            netcat-openbsd \
-            telnet
+            telnet \
+            moreutils

 ENV TZ=Europe/Moscow
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
@ -29,8 +30,15 @ COPY part_log.xml /etc/clickhouse-server/config.d/part_log.xml
 COPY log_queries.xml /etc/clickhouse-server/users.d/log_queries.xml

 CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
+    dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \
    dpkg -i package_folder/clickhouse-server_*.deb;  \
    dpkg -i package_folder/clickhouse-client_*.deb; \
    dpkg -i package_folder/clickhouse-test_*.deb; \
+    echo "TSAN_OPTIONS='halt_on_error=1'" >> /etc/environment; \
+    echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
+    echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
+    echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
+    echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
+    echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
    service zookeeper start; sleep 5; \
-    service clickhouse-server start && sleep 5 && clickhouse-test --shard --zookeeper $SKIP_TESTS_OPTION 2>&1 | tee test_output/test_result.txt
+    service clickhouse-server start && sleep 5 && clickhouse-test --shard --zookeeper $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt
--- a/docker/test/stress/Dockerfile
+++ b/docker/test/stress/Dockerfile
@ -1,4 +1,4 @@
-FROM ubuntu:18.10
+FROM yandex/clickhouse-deb-builder

 RUN apt-get update -y \
    && env DEBIAN_FRONTEND=noninteractive \
@ -24,7 +24,14 @@ COPY log_queries.xml /etc/clickhouse-server/users.d/log_queries.xml
 COPY part_log.xml /etc/clickhouse-server/config.d/part_log.xml

 CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
+    dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \
    dpkg -i package_folder/clickhouse-server_*.deb;  \
    dpkg -i package_folder/clickhouse-client_*.deb; \
    dpkg -i package_folder/clickhouse-test_*.deb; \
+    echo "TSAN_OPTIONS='halt_on_error=1'" >> /etc/environment; \
+    echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
+    echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
+    echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
+    echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
+    echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
    service clickhouse-server start && sleep 1 && ./stress --output-folder test_output
--- a/docs/en/operations/server_settings/settings.md
+++ b/docs/en/operations/server_settings/settings.md
@ -163,7 +163,7 @@ You can configure multiple `<graphite>` clauses. For instance, you can use this
 ```


-## graphite_rollup
+## graphite_rollup {#server_settings-graphite_rollup}

 Settings for thinning data for Graphite.

@ -416,7 +416,7 @@ The value 0 means that you can delete all tables without any restrictions.
 ```


-## merge_tree
+## merge_tree {#server_settings-merge_tree}

 Fine tuning for tables in the [ MergeTree](../../operations/table_engines/mergetree.md).

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -22,7 +22,7 @@ The possible values are:
 - `allow`  — Allows the use of these types of subqueries.


-## fallback_to_stale_replicas_for_distributed_queries
+## fallback_to_stale_replicas_for_distributed_queries {#settings-fallback_to_stale_replicas_for_distributed_queries}

 Forces a query to an out-of-date replica if updated data is not available. See "[Replication](../../operations/table_engines/replication.md)".

--- a/docs/en/operations/table_engines/collapsingmergetree.md
+++ b/docs/en/operations/table_engines/collapsingmergetree.md
@ -2,7 +2,7 @@

 The engine inherits from [MergeTree](mergetree.md) and adds the logic of rows collapsing to data parts merge algorithm.

-`CollapsingMergeTree` asynchronously deletes (collapses) pairs of rows if all of the fields in a row are equivalent excepting the particular field `Sign` which can have `1` and `-1` values. Rows without a pair are kept. For more details see the [Collapsing](#collapsing) section of the document.
+`CollapsingMergeTree` asynchronously deletes (collapses) pairs of rows if all of the fields in a row are equivalent except for the special field `Sign`, which can have the values `1` and `-1`. Rows without a pair are kept. For more details, see the [Collapsing](#table_engine-collapsingmergetree-collapsing) section of this document.

 The engine may significantly reduce the volume of storage and increase efficiency of `SELECT` query as a consequence.

@ -31,7 +31,7 @@ For a description of query parameters, see [query description](../../query_langu

 **Query clauses**

-When creating a `CollapsingMergeTree` table, the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table.
+When creating a `CollapsingMergeTree` table, the same [query clauses](mergetree.md#table_engine-mergetree-creating-a-table) are required, as when creating a `MergeTree` table.

 <details markdown="1"><summary>Deprecated Method for Creating a Table</summary>

@ -55,13 +55,13 @@ All of the parameters excepting `sign` have the same meaning as in `MergeTree`.
 </details>


-## Collapsing
+## Collapsing {#table_engine-collapsingmergetree-collapsing}

 ### Data

 Consider the situation where you need to save continually changing data for some object. It sounds logical to have one row for an object and update it at any change, but update operation is expensive and slow for DBMS because it requires rewriting of the data in the storage. If you need to write data quickly, update not acceptable, but you can write the changes of an object sequentially as follows.

-Use the particular column `Sign` when writing row. If `Sign = 1` it means that the row is a state of an object, let's call it "state" row. If `Sign = -1` it means the cancellation of the state of an object with the same attributes, let's call it "cancel" row.
+Use the particular column `Sign`. If `Sign = 1` it means that the row is a state of an object, let's call it "state" row. If `Sign = -1` it means the cancellation of the state of an object with the same attributes, let's call it "cancel" row.

 For example, we want to calculate how much pages users checked at some site and how long they were there. At some moment of time we write the following row with the state of user activity:

@ -95,15 +95,15 @@ As we need only the last state of user activity, the rows

 can be deleted collapsing the invalid (old) state of an object. `CollapsingMergeTree` does this while merging of the data parts.

-Why we need 2 rows for each change read in the "Algorithm" paragraph.
+To find out why we need 2 rows for each change, see the [Algorithm](#table_engine-collapsingmergetree-collapsing-algorithm) paragraph.

 **Peculiar properties of such approach**

 1. The program that writes the data should remember the state of an object to be able to cancel it. "Cancel" string should be the copy of "state" string with the opposite `Sign`. It increases the initial size of storage but allows to write the data quickly.
 2. Long growing arrays in columns reduce the efficiency of the engine due to load for writing. The more straightforward data, the higher efficiency.
-3. `SELECT` results depend strongly on the consistency of object changes history. Be accurate when preparing data for inserting. You can get unpredictable results in inconsistent data, for example, negative values for non-negative metrics such as session depth.
+3. The `SELECT` results depend strongly on the consistency of object changes history. Be accurate when preparing data for inserting. You can get unpredictable results in inconsistent data, for example, negative values for non-negative metrics such as session depth.

-### Algorithm
+### Algorithm {#table_engine-collapsingmergetree-collapsing-algorithm}

 When ClickHouse merges data parts, each group of consecutive rows with the same primary key is reduced to not more than two rows, one with `Sign = 1` ("state" row) and another with `Sign = -1` ("cancel" row). In other words, entries collapse.

@ -181,8 +181,8 @@ SELECT * FROM UAct
 ```

 What do we see and where is collapsing?
-With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows.
-Collapsing not occurred because there was no merge of the data parts yet. ClickHouse merges data part in an unknown moment of time which we can not predict.
+
+With two `INSERT` queries, we created 2 data parts. The `SELECT` query was performed in 2 threads, and we got a random order of rows. Collapsing did not occur because there was no merge of the data parts yet. ClickHouse merges data parts at an unknown moment of time which we cannot predict.

 Thus we need aggregation:

--- a/docs/en/operations/table_engines/custom_compression_codec.md
+++ b/docs/en/operations/table_engines/custom_compression_codec.md
@ -0,0 +1,44 @@
+
+# Column Compression Codecs
+
+Besides default data compression, defined in [server settings](../server_settings/settings.md#compression), per-column specification is also available.
+
+Supported compression algorithms:
+
+- `NONE` - no compression for data applied
+- `LZ4`
+- `LZ4HC(level)` - LZ4_HC compression algorithm with defined `level`.
+Possible `level` range: \[3, 12\]. Default value: 9. Greater values stand for better compression and higher CPU usage. Recommended value range: \[4, 9\].
+- `ZSTD(level)` - ZSTD compression algorithm with defined `level`. Possible `level` value range: \[1, 22\]. Default value: 1.
+Greater values stand for better compression and higher CPU usage.
+- `Delta(delta_bytes)` - compression approach when raw values are replaced with the difference of two neighbouring values. Up to `delta_bytes` are used for storing delta value.
+Possible `delta_bytes` values: 1, 2, 4, 8. Default value for `delta_bytes` is `sizeof(type)` if it equals 1, 2, 4, or 8, and 1 otherwise.
+
+Syntax example:
+```
+CREATE TABLE codec_example
+(
+    dt Date CODEC(ZSTD), /* default compression level is used */
+    ts DateTime CODEC(LZ4HC),
+    float_value Float32 CODEC(NONE),
+    double_value Float64 CODEC(LZ4HC(9))
+)
+ENGINE = MergeTree
+PARTITION BY tuple()
+ORDER BY dt
+```
+
+Codecs can be combined in a pipeline. The default table codec is not included in the pipeline (if it should be applied to a column, you have to specify it explicitly in the pipeline). The example below shows an optimization approach for storing timeseries metrics.
+Usually, values for a particular metric, stored in `path`, do not differ significantly from point to point. Using delta-encoding allows to reduce disk space usage significantly.
+```
+CREATE TABLE timeseries_example
+(
+    dt Date,
+    ts DateTime,
+    path String,
+    value Float32 CODEC(Delta(2), ZSTD)
+)
+ENGINE = MergeTree
+PARTITION BY dt
+ORDER BY (path, ts)
+```
--- a/docs/en/operations/table_engines/graphitemergetree.md
+++ b/docs/en/operations/table_engines/graphitemergetree.md
@ -1,7 +1,7 @@

 # GraphiteMergeTree

-This engine is designed for rollup (thinning and aggregating/averaging) [Graphite](http://graphite.readthedocs.io/en/latest/index.html) data. It may be helpful to developers who want to use ClickHouse as a data store for Graphite.
+This engine is designed for thinning and aggregating/averaging (rollup) [Graphite](http://graphite.readthedocs.io/en/latest/index.html) data. It may be helpful to developers who want to use ClickHouse as a data store for Graphite.

 You can use any ClickHouse table engine to store the Graphite data if you don't need rollup, but if you need a rollup use `GraphiteMergeTree`. The engine reduces the volume of storage and increases the efficiency of queries from Graphite.

@ -29,9 +29,9 @@ For a description of request parameters, see [request description](../../query_l
 A table for the Graphite date should have the following columns:

 - Column with the metric name (Graphite sensor). Data type: `String`.
- Column with the time for measuring the metric. Data type: `DateTime`.
+- Column with the time of measuring the metric. Data type: `DateTime`.
 - Column with the value of the metric. Data type: any numeric.
- Column with the version of the metric with the same name and time of measurement. Data type: any numeric.
+- Column with the version of the metric. Data type: any numeric.

    ClickHouse saves the rows with the highest version or the last written if versions are the same. Other rows are deleted during the merge of data parts.

@ -43,7 +43,7 @@ The names of these columns should be set in the rollup configuration.

 **Query clauses**

-When creating a `GraphiteMergeTree` table, the same [clauses](mergetree.md) are required, as when creating a `MergeTree` table.
+When creating a `GraphiteMergeTree` table, the same [clauses](mergetree.md#table_engine-mergetree-creating-a-table) are required, as when creating a `MergeTree` table.

 <details markdown="1"><summary>Deprecated Method for Creating a Table</summary>

@ -69,7 +69,7 @@ All of the parameters excepting `config_section` have the same meaning as in `Me

 ## Rollup configuration

-The settings for rollup are defined by the [graphite_rollup](../server_settings/settings.md) parameter in the server configuration. The name of the parameter could be any. You can create several configurations and use them for different tables.
+The settings for rollup are defined by the [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) parameter in the server configuration. The parameter name can be arbitrary. You can create several configurations and use them for different tables.

 Rollup configuration structure:

@ -102,10 +102,9 @@ Fields for `pattern` and `default` sections:
 The `required-columns`:

 - `path_column_name` — Column with the metric name (Graphite sensor).
- `time_column_name` — Column with the time for measuring the metric.
+- `time_column_name` — Column with the time of measuring the metric.
 - `value_column_name` — Column with the value of the metric at the time set in `time_column_name`.
- `version_column_name` — Column with the version timestamp of the metric with the same name and time remains in the database.
-
+- `version_column_name` — Column with the version of the metric.

 Example of settings:

--- a/docs/en/operations/table_engines/mergetree.md
+++ b/docs/en/operations/table_engines/mergetree.md
@ -26,7 +26,7 @@ Main features:
    The [Merge](merge.md) engine does not belong to the `*MergeTree` family.


-## Creating a Table
+## Creating a Table {#table_engine-mergetree-creating-a-table}

 ```
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
--- a/docs/en/operations/table_engines/replication.md
+++ b/docs/en/operations/table_engines/replication.md
@ -18,9 +18,9 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa

 `CREATE`, `DROP`, `ATTACH`, `DETACH` and `RENAME` queries are executed on a single server and are not replicated:

- `The CREATE TABLE` query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica.
- `The DROP TABLE` query deletes the replica located on the server where the query is run.
- `The RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas.
+- The `CREATE TABLE` query creates a new replicatable table on the server where the query is run. If this table already exists on other servers, it adds a new replica.
+- The `DROP TABLE` query deletes the replica located on the server where the query is run.
+- The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas.

 To use replication, set the addresses of the ZooKeeper cluster in the config file. Example:

@ -47,7 +47,7 @@ You can specify any existing ZooKeeper cluster and the system will use a directo

 If ZooKeeper isn't set in the config file, you can't create replicated tables, and any existing replicated tables will be read-only.

-ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md).
+ZooKeeper is not used in `SELECT` queries because replication does not affect the performance of `SELECT` and queries run just as fast as they do for non-replicated tables. When querying distributed replicated tables, ClickHouse behavior is controlled by the settings [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md#settings-fallback_to_stale_replicas_for_distributed_queries).

 For each `INSERT` query, approximately ten entries are added to ZooKeeper through several transactions. (To be more precise, this is for each inserted block of data; an INSERT query contains one block or one block per `max_insert_block_size = 1048576` rows.) This leads to slightly longer latencies for `INSERT` compared to non-replicated tables. But if you follow the recommendations to insert data in batches of no more than one `INSERT` per second, it doesn't create any problems. The entire ClickHouse cluster used for coordinating one ZooKeeper cluster has a total of several hundred `INSERTs` per second. The throughput on data inserts (the number of rows per second) is just as high as for non-replicated data.

@ -59,7 +59,7 @@ By default, an INSERT query waits for confirmation of writing the data from only

 Each block of data is written atomically. The INSERT query is divided into blocks up to `max_insert_block_size = 1048576` rows. In other words, if the `INSERT` query has less than 1048576 rows, it is made atomically.

-Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn't know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn't matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge_tree](../server_settings/settings.md) server settings.
+Data blocks are deduplicated. For multiple writes of the same data block (data blocks of the same size containing the same rows in the same order), the block is only written once. The reason for this is in case of network failures when the client application doesn't know if the data was written to the DB, so the `INSERT` query can simply be repeated. It doesn't matter which replica INSERTs were sent to with identical data. `INSERTs` are idempotent. Deduplication parameters are controlled by [merge_tree](../server_settings/settings.md#server_settings-merge_tree) server settings.

 During replication, only the source data to insert is transferred over the network. Further data transformation (merging) is coordinated and performed on all the replicas in the same way. This minimizes network usage, which means that replication works well when replicas reside in different datacenters. (Note that duplicating data in different datacenters is the main goal of replication.)

--- a/docs/en/query_language/create.md
+++ b/docs/en/query_language/create.md
@ -17,8 +17,8 @@ The `CREATE TABLE` query can have several forms.
 ```sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
 (
-    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
-    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [compression_codec],
    ...
 ) ENGINE = engine
 ```
@ -80,6 +80,11 @@ If you add a new column to a table but later change its default expression, the

 It is not possible to set default values for elements in nested data structures.

+### Column compression codecs
+
+Table columns can use either the common compression codec defined in server settings or an individual one defined in `compression_codec`.
+[Detailed description](../operations/table_engines/custom_compression_codec.md).
+
 ### Temporary Tables

 ClickHouse supports temporary tables which have the following characteristics:
--- a/docs/en/query_language/functions/date_time_functions.md
+++ b/docs/en/query_language/functions/date_time_functions.md
@ -80,7 +80,7 @@ Rounds down a date with time to the start of the minute.

 ## toStartOfFiveMinute

-Rounds down a date with time to the start of the hour.
+Rounds down a date with time to the start of the five-minute interval.

 ## toStartOfFifteenMinutes

--- a/docs/en/query_language/table_functions/remote.md
+++ b/docs/en/query_language/table_functions/remote.md
@ -1,5 +1,5 @@

-# remote
+# remote, remoteSecure

 Allows you to access remote servers without creating a `Distributed` table.

@ -72,5 +72,6 @@ The `remote` table function can be useful in the following cases:
 If the user is not specified, `default` is used.
 If the password is not specified, an empty password is used.

+`remoteSecure` - same as `remote` but with a secured connection. Default port - `tcp_port_secure` from config or 9440.

 [Original article](https://clickhouse.yandex/docs/en/query_language/table_functions/remote/) <!--hide-->
--- a/docs/ru/getting_started/index.md
+++ b/docs/ru/getting_started/index.md
@ -50,7 +50,7 @@ sudo apt-get install clickhouse-client clickhouse-server

 ### Из исходникого кода

-Для компиляции ClickHouse вручную, испольщуйте инструкцию для [Linux](../development/build.md) или [Mac OS X](../development/build_osx.md).
+Для компиляции ClickHouse вручную, используйте инструкцию для [Linux](../development/build.md) или [Mac OS X](../development/build_osx.md).

 Можно скомпилировать пакеты и установить их, либо использовать программы без установки пакетов. Также при ручой сборке можно отключить необходимость поддержки набора инструкций SSE 4.2 или собрать под процессоры архитектуры AArch64.

@ -97,7 +97,7 @@ $ clickhouse-client

 По умолчанию он соединяется с localhost:9000, от имени пользователя `default` без пароля. Также клиент может быть использован для соединения с удалённым сервером с помощью аргемента `--host`.

-Терминал должен использлвать кодировку UTF-8.
+Терминал должен использовать кодировку UTF-8.

 Более подробная информация о клиенте располагается в разделе [«Клиент командной строки»](../interfaces/cli.md).

--- a/docs/ru/operations/server_settings/settings.md
+++ b/docs/ru/operations/server_settings/settings.md
@ -164,7 +164,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat
 ```


-## graphite_rollup
+## graphite_rollup {#server_settings-graphite_rollup}

 Настройка прореживания данных для Graphite.

@ -416,7 +416,7 @@ ClickHouse проверит условия `min_part_size` и `min_part_size_rat
 ```


-## merge_tree
+## merge_tree {#server_settings-merge_tree}

 Тонкая настройка таблиц семейства [MergeTree](../../operations/table_engines/mergetree.md).

--- a/docs/ru/operations/table_engines/collapsingmergetree.md
+++ b/docs/ru/operations/table_engines/collapsingmergetree.md
@ -1,37 +1,222 @@
 # CollapsingMergeTree {#table_engine-collapsingmergetree}

-*Движок достаточно специфичен для Яндекс.Метрики.*
+Движок наследует функциональность от [MergeTree](mergetree.md) и добавляет в алгоритм слияния кусков данных логику сворачивания (удаления) строк.

-Отличается от `MergeTree` тем, что позволяет автоматически удалять - "схлопывать" некоторые пары строк при слиянии.
+`CollapsingMergeTree` асинхронно удаляет (сворачивает) пары строк, если все поля в строке эквивалентны, за исключением специального поля `Sign`, которое может принимать значения `1` и `-1`. Строки без пары сохраняются. Подробнее смотрите раздел [Сворачивание (удаление) строк](#table_engine-collapsingmergetree-collapsing).

-В Яндекс.Метрике есть обычные логи (например, лог хитов) и логи изменений. Логи изменений используются, чтобы инкрементально считать статистику по постоянно меняющимся данным. Например - логи изменений визитов, логи изменений истории посетителей. Визиты в Яндекс.Метрике постоянно меняются - например, увеличивается количество хитов в визите. Изменением какого либо объекта будем называть пару (?старые значения, ?новые значения). Старые значения могут отсутствовать, если объект создался. Новые значения могут отсутствовать, если объект удалился. Если объект изменился, но был раньше и не удалился - присутствует оба значения. В лог изменений, для каждого изменения, пишется от одной до двух записей. Каждая запись содержит все те же атрибуты, что и сам объект, и ещё специальный атрибут, который позволяет отличить старые и новые значения. Видно, что при изменении объектов, в лог изменений лишь дописываются новые записи и не трогаются уже имеющиеся.
+Движок может значительно уменьшить объем хранения и, как следствие, повысить эффективность запросов `SELECT`.

-Лог изменений позволяет инкрементально считать почти любую статистику. Для этого надо учитывать "новые" строки с положительным знаком, и "старые" строки с отрицательным знаком. То есть, возможно инкрементально считать все статистики, алгебраическая структура которых содержит операцию взятия обратного элемента. Большинство статистик именно такие. Также удаётся посчитать "идемпотентные" статистики, например, количество уникальных посетителей, так как при изменении визитов, уникальные посетители не удаляются.
+## Создание таблицы

-Это - основная идея, благодаря которой Яндекс.Метрика работает в реальном времени.
-
-CollapsingMergeTree принимает дополнительный параметр - имя столбца типа Int8, содержащего "знак" строки. Пример:
-
-``` sql
-CollapsingMergeTree(EventDate, (CounterID, EventDate, intHash32(UniqID), VisitID), 8192, Sign)
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE = CollapsingMergeTree(sign)
+[PARTITION BY expr]
+[ORDER BY expr]
+[SAMPLE BY expr]
+[SETTINGS name=value, ...]
 ```

-Здесь `Sign` - столбец, содержащий -1 для "старых" значений и 1 для "новых" значений.
+Подробности про `CREATE TABLE` смотрите в [описании запроса](../../query_language/create.md).

-При слиянии, для каждой группы идущих подряд одинаковых значений первичного ключа (столбцов, по которым сортируются данные), остаётся не более одной строки со значением столбца sign_column = -1 ("отрицательной строки") и не более одной строки со значением столбца sign_column = 1 ("положительной строки"). То есть - производится схлопывание записей из лога изменений.
+**Параметры CollapsingMergeTree**

-Если количество положительных и отрицательных строк совпадает - то пишет первую отрицательную и последнюю положительную строку.
-Если положительных на 1 больше, чем отрицательных - то пишет только последнюю положительную строку.
-Если отрицательных на 1 больше, чем положительных - то пишет только первую отрицательную строку.
-Иначе - логическая ошибка, и ни одна из таких строк не пишется. (Логическая ошибка может возникать, если случайно один кусок лога был вставлен более одного раза. Поэтому, об ошибке всего лишь пишется в лог сервера, и слияние продолжает работать.)
+- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния.

-Как видно, от схлопывания не должны меняться результаты расчётов статистик.
-Изменения постепенно схлопываются так что в конце-концов, для почти каждого объекта, остаются лишь его последние значения.
-По сравнению с MergeTree, движок CollapsingMergeTree позволяет в несколько раз уменьшить объём данных.
+    Тип данных столбца — `Int8`.

-Существует несколько способов получения полностью "схлопнутых" данных из таблицы типа `CollapsingMergeTree`:
+**Секции запроса**
+При создании таблицы `CollapsingMergeTree` используются те же [секции](mergetree.md#table_engine-mergetree-creating-a-table) запроса, что и при создании таблицы `MergeTree`.

-1. Написать запрос с GROUP BY и агрегатными функциями, учитывающими знак. Например, чтобы посчитать количество, надо вместо count() написать sum(Sign); чтобы посчитать сумму чего-либо, надо вместо sum(x) написать sum(Sign \* x) и т. п., а также добавить HAVING sum(Sign) `>` 0. Не все величины можно посчитать подобным образом. Например, агрегатные функции min, max не могут быть переписаны.
-2. Если необходимо вынимать данные без агрегации (например, проверить наличие строк, самые новые значения которых удовлетворяют некоторым условиям), можно использовать модификатор FINAL для секции FROM. Это вариант существенно менее эффективен.
+<details markdown="1"><summary>Устаревший способ создания таблицы</summary>
+
+!!! attention
+    Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ, описанный выше.
+
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+) ENGINE [=] CollapsingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, sign)
+```
+
+Все параметры, кроме `sign`, имеют то же значение, что и в `MergeTree`.
+
+- `sign` — Имя столбца с типом строки: `1` — строка состояния, `-1` — строка отмены состояния.
+
+    Тип данных столбца — `Int8`.
+
+</details>
+
+## Сворачивание (удаление) строк {#table_engine-collapsingmergetree-collapsing}
+
+### Данные
+
+Рассмотрим ситуацию, когда необходимо сохранять постоянно изменяющиеся данные для какого-либо объекта. Кажется логичным иметь одну строку для объекта и обновлять её при любом изменении, однако операция обновления является дорогостоящей и медленной для СУБД, поскольку требует перезаписи данных в хранилище. Если необходимо быстро записать данные, обновление недопустимо, но можно записать изменения объекта последовательно, как описано ниже.
+
+Используйте специальный столбец `Sign`. Если `Sign = 1`, то это означает, что строка является состоянием объекта, назовём её строкой состояния. Если `Sign = -1`, то это означает отмену состояния объекта с теми же атрибутами, назовём её строкой отмены состояния.
+
+Например, мы хотим рассчитать, сколько страниц проверили пользователи на каком-то сайте и как долго они там находились. В какой-то момент времени мы пишем следующую строку с состоянием действий пользователя:
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Через некоторое время мы регистрируем изменение активности пользователя и записываем его следующими двумя строками.
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │   -1 │
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Первая строка отменяет предыдущее состояние объекта (пользователя). Она должна повторять все поля отменённого состояния за исключением `Sign`.
+
+Вторая строка содержит текущее состояние.
+
+Поскольку нам нужно только последнее состояние активности пользователя, строки
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+│ 4324182021466249494 │         5 │      146 │   -1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+можно удалить, сворачивая (удаляя) устаревшее состояние объекта. `CollapsingMergeTree`  выполняет это при слиянии кусков данных.
+
+Зачем нужны 2 строки для каждого изменения, читайте в параграфе [Алгоритм](#table_engine-collapsingmergetree-collapsing-algorithm).
+
+**Особенности подхода**
+
+1. Программа, которая записывает данные, должна помнить состояние объекта, чтобы иметь возможность отменить его. Строка отмены состояния должна быть копией предыдущей строки состояния с противоположным значением `Sign`. Это увеличивает начальный размер хранилища, но позволяет быстро записывать данные.
+2. Длинные растущие массивы в столбцах снижают эффективность работы движка за счёт нагрузки на запись. Чем проще данные, тем выше эффективность.
+3. Результаты запроса `SELECT` сильно зависят от согласованности истории изменений объекта. Будьте точны при подготовке данных для вставки. Можно получить непредсказуемые результаты для несогласованных данных, например отрицательные значения для неотрицательных метрик, таких как глубина сеанса.
+
+### Алгоритм {#table_engine-collapsingmergetree-collapsing-algorithm}
+
+Когда ClickHouse объединяет куски данных, каждая группа последовательных строк с одним и тем же первичным ключом уменьшается до не более чем двух строк, одна из которых имеет `Sign = 1` (строка состояния), а другая строка с `Sign = -1` (строка отмены состояния). Другими словами, записи сворачиваются.
+
+Для каждого результирующего куска данных ClickHouse сохраняет:
+
+  1. Первую строку отмены состояния и последнюю строку состояния, если количество строк обоих видов совпадает.
+
+  1. Последнюю строку состояния, если строк состояния на одну больше, чем строк отмены состояния.
+
+  1. Первую строку отмены состояния, если их на одну больше, чем строк состояния.
+
+  1. Ни одну из строк во всех остальных случаях.
+
+      Слияние продолжается, но ClickHouse рассматривает эту ситуацию как логическую ошибку и записывает её в журнал сервера. Эта ошибка может возникать, если одни и те же данные вставлялись несколько раз.
+
+Как видно, от сворачивания не должны меняться результаты расчётов статистик.
+Изменения постепенно сворачиваются так, что остаётся лишь последнее состояние почти каждого объекта.
+
+Столбец `Sign` необходим, поскольку алгоритм слияния не гарантирует, что все строки с одинаковым первичным ключом будут находиться в одном результирующем куске данных и даже на одном физическом сервере. ClickHouse выполняет запросы `SELECT` несколькими потоками, и он не может предсказать порядок строк в результате. Если необходимо получить полностью свёрнутые данные из таблицы `CollapsingMergeTree`, то необходимо агрегирование.
+
+Для завершения свертывания добавьте в запрос секцию `GROUP BY` и агрегатные функции, которые учитывают знак. Например, для расчета количества используйте `sum(Sign)` вместо `count()`. Чтобы вычислить сумму чего-либо, используйте `sum(Sign * x)` вместо `sum(x)`, и так далее, а также добавьте `HAVING sum(Sign) > 0`.
+
+Таким образом можно вычислять агрегации `count`, `sum` и `avg`. Если объект имеет хотя бы одно не свёрнутое состояние, то может быть вычислена агрегация `uniq`. Агрегации `min` и `max` невозможно вычислить, поскольку `CollapsingMergeTree` не сохраняет историю значений свернутых состояний.
+
+Если необходимо выбирать данные без агрегации (например, проверить наличие строк, последние значения которых удовлетворяют некоторым условиям), можно использовать модификатор `FINAL` для секции `FROM`. Этот вариант существенно менее эффективен.
+
+## Пример использования
+
+Исходные данные:
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+│ 4324182021466249494 │         5 │      146 │   -1 │
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Создание таблицы:
+
+```sql
+CREATE TABLE UAct
+(
+    UserID UInt64,
+    PageViews UInt8,
+    Duration UInt8,
+    Sign Int8
+)
+ENGINE = CollapsingMergeTree(Sign)
+ORDER BY UserID
+```
+
+Вставка данных:
+
+```sql
+INSERT INTO UAct VALUES (4324182021466249494, 5, 146, 1)
+```
+
+```sql
+INSERT INTO UAct VALUES (4324182021466249494, 5, 146, -1),(4324182021466249494, 6, 185, 1)
+```
+
+Мы используем два запроса `INSERT` для создания двух различных кусков данных. Если вставить данные одним запросом, ClickHouse создаёт один кусок данных и никогда не будет выполнять слияние.
+
+Получение данных:
+
+```
+SELECT * FROM UAct
+```
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │   -1 │
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         5 │      146 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Что мы видим и где сворачивание?
+
+Двумя запросами `INSERT` мы создали два куска данных. Запрос `SELECT` был выполнен в 2 потока, и мы получили случайный порядок строк. Сворачивание не произошло, так как слияние кусков данных еще не произошло. ClickHouse объединяет куски данных в неизвестный момент времени, который мы не можем предсказать.
+
+Таким образом, нам нужна агрегация:
+
+```sql
+SELECT
+    UserID,
+    sum(PageViews * Sign) AS PageViews,
+    sum(Duration * Sign) AS Duration
+FROM UAct
+GROUP BY UserID
+HAVING sum(Sign) > 0
+```
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┐
+│ 4324182021466249494 │         6 │      185 │
+└─────────────────────┴───────────┴──────────┘
+```
+
+Если нам не нужна агрегация, но мы хотим принудительно выполнить свёртку данных, можно использовать модификатор `FINAL` для секции `FROM`.
+
+```sql
+SELECT * FROM UAct FINAL
+```
+
+```
+┌──────────────UserID─┬─PageViews─┬─Duration─┬─Sign─┐
+│ 4324182021466249494 │         6 │      185 │    1 │
+└─────────────────────┴───────────┴──────────┴──────┘
+```
+
+Такой способ выбора данных очень неэффективен. Не используйте его для больших таблиц.

 [Оригинальная статья](https://clickhouse.yandex/docs/ru/operations/table_engines/collapsingmergetree/) <!--hide-->
--- a/docs/ru/operations/table_engines/custom_compression_codec.md
+++ b/docs/ru/operations/table_engines/custom_compression_codec.md
@ -0,0 +1,45 @@
+
+# Форматы сжатия для колонок
+
+Помимо сжатия для колонок по умолчанию, определяемого в [настройках сервера](../server_settings/settings.md#compression),
+существует возможность указать формат сжатия индивидуально для каждой колонки.
+
+Поддерживаемые форматы:
+
+- `NONE` - сжатие отсутствует
+- `LZ4`
+- `LZ4HC(level)` - алгоритм сжатия LZ4_HC с указанным уровнем компрессии `level`.
+Возможный диапазон значений `level`: \[3, 12\]. Значение по умолчанию: 9. Чем выше уровень, тем лучше сжатие, но тратится больше времени. Рекомендованный диапазон \[4, 9\].
+- `ZSTD(level)` - алгоритм сжатия ZSTD с указанным уровнем компрессии `level`. Возможный диапазон значений `level`: \[1, 22\]. Значение по умолчанию: 1.
+Чем выше уровень, тем лучше сжатие, но тратится больше времени.
+- `Delta(delta_bytes)` - способ сжатия, при котором вместо числовых значений поля сохраняется разность между двумя соседними значениями. Значение `delta_bytes` - число байт для хранения дельты.
+Возможные значения: 1, 2, 4, 8. Значение по умолчанию: если `sizeof(type)` равен 1, 2, 4, 8 - `sizeof(type)`, иначе - 1.
+
+Пример использования:
+```
+CREATE TABLE codec_example
+(
+    dt Date CODEC(ZSTD), /* используется уровень сжатия по умолчанию */
+    ts DateTime CODEC(LZ4HC),
+    float_value Float32 CODEC(NONE),
+    double_value Float64 CODEC(LZ4HC(9))
+)
+ENGINE = MergeTree
+PARTITION BY tuple()
+ORDER BY dt
+```
+
+Кодеки могут комбинироваться между собой. Если для колонки указана своя последовательность кодеков, то общий табличный кодек не применяется (должен быть указан в последовательности принудительно, если нужен). В примере ниже - оптимизация для хранения timeseries метрик.
+Как правило, значения одной и той же метрики `path` не сильно различаются между собой, и выгоднее использовать дельта-компрессию вместо записи всего числа:
+```
+CREATE TABLE timeseries_example
+(
+    dt Date,
+    ts DateTime,
+    path String,
+    value Float32 CODEC(Delta(2), ZSTD)
+)
+ENGINE = MergeTree
+PARTITION BY dt
+ORDER BY (path, ts)
+```
--- a/docs/ru/operations/table_engines/graphitemergetree.md
+++ b/docs/ru/operations/table_engines/graphitemergetree.md
@ -1,58 +1,122 @@
-
 # GraphiteMergeTree

-Движок предназначен для rollup (прореживания и агрегирования/усреднения) данных [Graphite](http://graphite.readthedocs.io/en/latest/index.html). Он может быть интересен разработчикам, которые хотят использовать ClickHouse как хранилище данных для Graphite.
+Движок предназначен для прореживания и агрегирования/усреднения (rollup) данных [Graphite](http://graphite.readthedocs.io/en/latest/index.html). Он может быть интересен разработчикам, которые хотят использовать ClickHouse как хранилище данных для Graphite.

-Graphite хранит в ClickHouse полные данные, а получать их может следующими способами:
+Если rollup не требуется, то для хранения данных Graphite можно использовать любой движок таблиц ClickHouse, в противном случае используйте `GraphiteMergeTree`. Движок уменьшает объем хранения и повышает эффективность запросов от Graphite.

-   Без прореживания.
+Движок наследует свойства от [MergeTree](mergetree.md).

-    Используется движок [MergeTree](mergetree.md).
+## Создание таблицы

-   С прореживанием.
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    Path String,
+    Time DateTime,
+    Value <Numeric_type>,
+    Version <Numeric_type>
+    ...
+) ENGINE = GraphiteMergeTree(config_section)
+[PARTITION BY expr]
+[ORDER BY expr]
+[SAMPLE BY expr]
+[SETTINGS name=value, ...]
+```

-    Используется движок `GraphiteMergeTree`.
+Описание параметров запроса смотрите в [описании запроса](../../query_language/create.md).

-Движок наследует свойства MergeTree. Настройки прореживания данных задаются параметром [graphite_rollup](../server_settings/settings.md) в конфигурации сервера .
+Таблица для данных Graphite должна содержать следующие столбцы:

-## Использование движка
+- Колонка с названием метрики (Graphite sensor). Тип данных: `String`.

-Таблица с данными Graphite должна содержать как минимум следующие поля:
+- Столбец со временем измерения метрики. Тип данных: `DateTime`.

-   `Path` - имя метрики (сенсора Graphite).
-   `Time` - время измерения.
-   `Value` - значение метрики в момент времени Time.
-   `Version` - настройка, которая определяет какое значение метрики с одинаковыми Path и Time останется в базе.
+- Столбец со значением метрики. Тип данных: любой числовой.

-Шаблон правил rollup:
+- Столбец с версией метрики. Тип данных: любой числовой.
+
+    ClickHouse сохраняет строки с последней версией или последнюю записанную строку, если версии совпадают. Другие строки удаляются при слиянии кусков данных.
+
+Имена этих столбцов должны быть заданы в конфигурации rollup.
+
+**Параметры GraphiteMergeTree**
+
+- `config_section` — имя раздела в конфигурационном файле, в котором находятся правила rollup.
+
+**Секции запроса**
+
+При создании таблицы `GraphiteMergeTree` используются те же [секции](mergetree.md#table_engine-mergetree-creating-a-table) запроса, что при создании таблицы `MergeTree`.
+
+<details markdown="1"><summary>Устаревший способ создания таблицы</summary>
+
+!!! attention
+    Не используйте этот способ в новых проектах и по возможности переведите старые проекты на способ, описанный выше.
+
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
+(
+    EventDate Date,
+    Path String,
+    Time DateTime,
+    Value <Numeric_type>,
+    Version <Numeric_type>
+    ...
+) ENGINE [=] GraphiteMergeTree(date-column [, sampling_expression], (primary, key), index_granularity, config_section)
+```
+
+Все параметры, кроме `config_section`, имеют то же значение, что в `MergeTree`.
+
+- `config_section` — имя раздела в конфигурационном файле, в котором находятся правила rollup.
+
+</details>
+
+## Конфигурация rollup
+
+Настройки для прореживания данных задаются параметром [graphite_rollup](../server_settings/settings.md#server_settings-graphite_rollup) в конфигурации сервера. Имя параметра может быть любым. Можно создать несколько конфигураций и использовать их для разных таблиц.
+
+Структура конфигурации rollup:

 ```
+required-columns
 pattern
    regexp
    function
-    age -> precision
+    age + precision
    ...
 pattern
    ...
 default
    function
-       age -> precision
+    age + precision
    ...
 ```

-При обработке записи ClickHouse проверит правила в секции `pattern`. Если имя метрики соответствует шаблону `regexp`, то применяются правила из `pattern`, в противном случае из `default`.
+При обработке строки ClickHouse проверяет правила в разделе `pattern`. Если имя метрики соответствует шаблону `regexp`, то  применяются правила из раздела `pattern`, в противном случае из раздела `default`.

-Поля шаблона правил.
+Правила определяются с помощью полей `function` и `age + precision`.

- `age` - Минимальный возраст данных в секундах.
- `function` - Имя агрегирующей функции, которую следует применить к данным, чей возраст оказался в интервале `[age, age + precision]`.
- `precision` - Точность определения возраста данных в секундах.
- `regexp` - Шаблон имени метрики.
+Поля для разделов `pattern` и `default`:
+
+- `regexp` – шаблон имени метрики.
+- `age` – минимальный возраст данных в секундах.
+- `precision` – точность определения возраста данных в секундах.
+- `function` – имя агрегирующей функции, которую следует применить к данным, чей возраст оказался в интервале `[age, age + precision]`.
+
+`required-columns`:
+
+- `path_column_name` — колонка с названием метрики (Graphite sensor).
+- `time_column_name` — столбец со временем измерения метрики.
+- `value_column_name` — столбец со значением метрики в момент времени, установленный в `time_column_name`.
+- `version_column_name` — столбец с версией метрики.

 Пример настройки:

 ```xml
 <graphite_rollup>
+    <path_column_name>Path</path_column_name>
+    <time_column_name>Time</time_column_name>
+    <value_column_name>Value</value_column_name>
+    <version_column_name>Version</version_column_name>
    <pattern>
        <regexp>click_cost</regexp>
        <function>any</function>
--- a/docs/ru/operations/table_engines/mergetree.md
+++ b/docs/ru/operations/table_engines/mergetree.md
@ -24,7 +24,7 @@
    При необходимости можно задать способ сэмплирования данных в таблице.


-## Создание таблицы
+## Создание таблицы {#table_engine-mergetree-creating-a-table}

 ```sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
--- a/docs/ru/operations/table_engines/replication.md
+++ b/docs/ru/operations/table_engines/replication.md
@ -1,4 +1,4 @@
-# Репликация данных {#table_engines-replication}
+# Репликация данных {#table_engines-репликация}

 Репликация поддерживается только для таблиц семейства MergeTree:

@ -7,6 +7,7 @@
 - ReplicatedReplacingMergeTree
 - ReplicatedAggregatingMergeTree
 - ReplicatedCollapsingMergeTree
+- ReplicatedVersionedCollapsingMergeTree
 - ReplicatedGraphiteMergeTree

 Репликация работает на уровне отдельных таблиц, а не всего сервера. То есть, на сервере могут быть расположены одновременно реплицируемые и не реплицируемые таблицы.
@ -15,11 +16,11 @@

 Реплицируются сжатые данные запросов `INSERT`, `ALTER` (см. подробности в описании запроса [ALTER](../../query_language/alter.md#query_language_queries_alter)).

-Запросы `CREATE`, `DROP`, `ATTACH`, `DETACH`, `RENAME` выполняются на одном сервере и не реплицируются:
+Запросы `CREATE`, `DROP`, `ATTACH`, `DETACH` и `RENAME` выполняются на одном сервере и не реплицируются:

- `CREATE TABLE` создаёт новую реплицируемую таблицу на том сервере, где выполняется запрос, а если на других серверах такая таблица уже есть - добавляет новую реплику.
+- Запрос `CREATE TABLE` создаёт новую реплицируемую таблицу на том сервере, где его выполнили. Если таблица уже существует на других серверах, запрос добавляет новую реплику.
 - `DROP TABLE` удаляет реплику, расположенную на том сервере, где выполняется запрос.
- `RENAME` переименовывает таблицу на одной из реплик - то есть, реплицируемые таблицы на разных репликах могут называться по разному.
+- Запрос `RENAME` переименовывает таблицу на одной из реплик. Другими словами, реплицируемые таблицы на разных репликах могут называться по-разному.

 Чтобы использовать репликацию, укажите в конфигурационном файле адреса ZooKeeper кластера. Пример:

@ -46,19 +47,19 @@

 Если в конфигурационном файле не настроен ZooKeeper, то вы не сможете создать реплицируемые таблицы, а уже имеющиеся реплицируемые таблицы будут доступны в режиме только на чтение.

-При запросах `SELECT`, ZooKeeper не используется, т.е. репликация не влияет на производительность `SELECT` и запросы работают так же быстро, как и для нереплицируемых таблиц. При запросах к распределенным реплицированным таблицам поведение ClickHouse регулируется настройками [max_replica_delay_for_distributed_queries](../settings/settings.md#settings_settings_max_replica_delay_for_distributed_queries) и [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md).
+При запросах `SELECT`, ZooKeeper не используется, т.е. репликация не влияет на производительность `SELECT` и запросы работают так же быстро, как и для нереплицируемых таблиц. При запросах к распределенным реплицированным таблицам поведение ClickHouse регулируется настройками [max_replica_delay_for_distributed_queries](../settings/settings.md#settings-max_replica_delay_for_distributed_queries) и [fallback_to_stale_replicas_for_distributed_queries](../settings/settings.md).

-При каждом запросе `INSERT` (точнее, на каждый вставляемый блок данных; запрос INSERT содержит один блок, или по блоку на каждые `max_insert_block_size = 1048576` строк), делается около десятка записей в ZooKeeper в рамках нескольких транзакций. Это приводит к некоторому увеличению задержек при `INSERT`, по сравнению с нереплицируемыми таблицами. Но если придерживаться обычных рекомендаций - вставлять данные пачками не более одного `INSERT` в секунду, то это не составляет проблем. На всём кластере ClickHouse, использующим для координации один кластер ZooKeeper, может быть в совокупности несколько сотен `INSERT` в секунду. Пропускная способность при вставке данных (количество строчек в секунду) такая же высокая, как для нереплицируемых таблиц.
+При каждом запросе `INSERT`, делается около десятка записей в ZooKeeper в рамках нескольких транзакций. (Точнее, записи делаются на каждый вставляемый блок данных; запрос INSERT содержит один блок или по блоку на каждые `max_insert_block_size = 1048576` строк.) Это приводит к некоторому увеличению задержек при `INSERT`, по сравнению с нереплицируемыми таблицами. Но если придерживаться обычных рекомендаций - вставлять данные пачками не более одного `INSERT` в секунду, то это не составляет проблем. На всём кластере ClickHouse, использующим для координации один кластер ZooKeeper, может быть в совокупности несколько сотен `INSERT` в секунду. Пропускная способность при вставке данных (количество строчек в секунду) такая же высокая, как для нереплицируемых таблиц.

 Для очень больших кластеров, можно использовать разные кластеры ZooKeeper для разных шардов. Впрочем, на кластере Яндекс.Метрики (примерно 300 серверов) такой необходимости не возникает.

-Репликация асинхронная, мульти-мастер. Запросы `INSERT` (а также `ALTER`) можно отправлять на любой доступный сервер. Данные вставятся на сервер, где выполнен запрос, а затем скопируются на остальные серверы. В связи с асинхронностью, только что вставленные данные появляются на остальных репликах с небольшой задержкой. Если часть реплик недоступна, данные на них запишутся тогда, когда они станут доступны. Если реплика доступна, то задержка составляет столько времени, сколько требуется для передачи блока сжатых данных по сети.
+Репликация асинхронная, мульти-мастер. Запросы `INSERT` и `ALTER` можно направлять на любой доступный сервер. Данные вставятся на сервер, где выполнен запрос, а затем скопируются на остальные серверы. В связи с асинхронностью, только что вставленные данные появляются на остальных репликах с небольшой задержкой. Если часть реплик недоступна, данные на них запишутся тогда, когда они станут доступны. Если реплика доступна, то задержка составляет столько времени, сколько требуется для передачи блока сжатых данных по сети.

-По умолчанию, запрос INSERT ждёт подтверждения записи только от одной реплики. Если данные были успешно записаны только на одну реплику, и сервер с этой репликой перестал существовать, то записанные данные будут потеряны. Вы можете включить подтверждение записи от нескольких реплик, используя настройку [insert_quorum](../settings/settings.md).
+По умолчанию, запрос INSERT ждёт подтверждения записи только от одной реплики. Если данные были успешно записаны только на одну реплику, и сервер с этой репликой перестал существовать, то записанные данные будут потеряны. Вы можете включить подтверждение записи от нескольких реплик, используя настройку `insert_quorum`.

 Каждый блок данных записывается атомарно. Запрос INSERT разбивается на блоки данных размером до `max_insert_block_size = 1048576` строк. То есть, если в запросе `INSERT` менее 1048576 строк, то он делается атомарно.

-Блоки данных дедуплицируются. При многократной записи одного и того же блока данных (блоков данных одинакового размера, содержащих одни и те же строчки в одном и том же порядке), блок будет записан только один раз. Это сделано для того, чтобы в случае сбоя в сети, когда клиентское приложение не может понять, были ли данные записаны в БД, можно было просто повторить запрос `INSERT`. При этом не имеет значения, на какую реплику будут отправлены INSERT-ы с одинаковыми данными. То есть, обеспечивается идемпотентность `INSERT`. Параметры дедупликации регулируются настройками сервера [merge_tree](../server_settings/settings.md).
+Блоки данных дедуплицируются. При многократной записи одного и того же блока данных (блоков данных одинакового размера, содержащих одни и те же строчки в одном и том же порядке), блок будет записан только один раз. Это сделано для того, чтобы в случае сбоя в сети, когда клиентское приложение не может понять, были ли данные записаны в БД, можно было просто повторить запрос `INSERT`. При этом не имеет значения, на какую реплику будут отправлены INSERT-ы с одинаковыми данными. Запрос `INSERT` идемпотентный. Параметры дедуплицирования регулируются настройками сервера [merge_tree](../server_settings/settings.md#server_settings-merge_tree).

 При репликации, по сети передаются только исходные вставляемые данные. Дальнейшие преобразования данных (слияния) координируются и делаются на всех репликах одинаковым образом. За счёт этого минимизируется использование сети, и благодаря этому, репликация хорошо работает при расположении реплик в разных датацентрах. (Стоит заметить, что дублирование данных в разных датацентрах, по сути, является основной задачей репликации).

@ -66,12 +67,14 @@

 Система следит за синхронностью данных на репликах и умеет восстанавливаться после сбоя. Восстановление после сбоя автоматическое (в случае небольших различий в данных) или полуавтоматическое (когда данные отличаются слишком сильно, что может свидетельствовать об ошибке конфигурации).

-
 ## Создание реплицируемых таблиц

 В начало имени движка таблицы добавляется `Replicated`. Например, `ReplicatedMergeTree`.

-Также добавляются два параметра в начало списка параметров - путь к таблице в ZooKeeper, имя реплики в ZooKeeper.
+**Параметры Replicated\*MergeTree**
+
+- `zoo_path` — путь к таблице в ZooKeeper.
+- `replica_name`  — имя реплики в ZooKeeper.

 Пример:

@ -100,7 +103,7 @@ CREATE TABLE table_name

 </details>

-Как видно в примере, эти параметры могут содержать подстановки в фигурных скобках. Подставляемые значения достаются из конфигурационного файла, из секции macros. Пример:
+Как видно в примере, эти параметры могут содержать подстановки в фигурных скобках. Подставляемые значения достаются из конфигурационного файла, из секции `macros`. Пример:

 ```xml
 <macros>
@ -113,13 +116,13 @@ CREATE TABLE table_name
 Путь к таблице в ZooKeeper должен быть разным для каждой реплицируемой таблицы. В том числе, для таблиц на разных шардах, должны быть разные пути.
 В данном случае, путь состоит из следующих частей:

-`/clickhouse/tables/` - общий префикс. Рекомендуется использовать именно его.
+`/clickhouse/tables/`  — общий префикс. Рекомендуется использовать именно его.

-`{layer}-{shard}` - идентификатор шарда. В данном примере он состоит из двух частей, так как на кластере Яндекс.Метрики используется двухуровневое шардирование. Для большинства задач, оставьте только подстановку {shard}, которая будет раскрываться в идентификатор шарда.
+`{layer}-{shard}`  — идентификатор шарда. В данном примере он состоит из двух частей, так как на кластере Яндекс.Метрики используется двухуровневое шардирование. Для большинства задач, оставьте только подстановку {shard}, которая будет раскрываться в идентификатор шарда.

 `hits` - имя узла для таблицы в ZooKeeper. Разумно делать его таким же, как имя таблицы. Оно указывается явно, так как, в отличие от имени таблицы, оно не меняется после запроса RENAME.

-Имя реплики - то, что идентифицирует разные реплики одной и той же таблицы. Можно использовать для него имя сервера, как показано в примере. Впрочем, достаточно, чтобы имя было уникально лишь в пределах каждого шарда.
+Имя реплики  — то, что идентифицирует разные реплики одной и той же таблицы. Можно использовать для него имя сервера, как показано в примере. Впрочем, достаточно, чтобы имя было уникально лишь в пределах каждого шарда.

 Можно не использовать подстановки, а указать соответствующие параметры явно. Это может быть удобным для тестирования и при настройке маленьких кластеров. Однако в этом случае нельзя пользоваться распределенными DDL-запросами (`ON CLUSTER`).

@ -129,7 +132,7 @@ CREATE TABLE table_name

 Если вы добавляете новую реплику после того, как таблица на других репликах уже содержит некоторые данные, то после выполнения запроса, данные на новую реплику будут скачаны с других реплик. То есть, новая реплика синхронизирует себя с остальными.

-Для удаления реплики, выполните запрос `DROP TABLE`. При этом, удаляется только одна реплика - расположенная на том сервере, где вы выполняете запрос.
+Для удаления реплики, выполните запрос `DROP TABLE`. При этом, удаляется только одна реплика  — расположенная на том сервере, где вы выполняете запрос.

 ## Восстановление после сбоя

@ -143,9 +146,9 @@ CREATE TABLE table_name

 Стоит заметить, что ClickHouse не делает самостоятельно никаких деструктивных действий типа автоматического удаления большого количества данных.

-При старте сервера (или создании новой сессии с ZooKeeper), проверяется только количество и размеры всех файлов. Если у файлов совпадают размеры, но изменены байты где-то посередине, то это обнаруживается не сразу, а только при попытке их прочитать при каком-либо запросе `SELECT` - запрос кинет исключение о несоответствующей чексумме или размере сжатого блока. В этом случае, куски данных добавляются в очередь на проверку, и при необходимости, скачиваются с реплик.
+При старте сервера (или создании новой сессии с ZooKeeper), проверяется только количество и размеры всех файлов. Если у файлов совпадают размеры, но изменены байты где-то посередине, то это обнаруживается не сразу, а только при попытке их прочитать при каком-либо запросе `SELECT`. Запрос кинет исключение о несоответствующей чексумме или размере сжатого блока. В этом случае, куски данных добавляются в очередь на проверку, и при необходимости, скачиваются с реплик.

-Если обнаруживается, что локальный набор данных слишком сильно отличается от ожидаемого, то срабатывает защитный механизм - сервер сообщает об этом в лог и отказывается запускаться. Это сделано, так как такой случай может свидетельствовать об ошибке конфигурации - например, если реплика одного шарда была случайно сконфигурирована, как реплика другого шарда. Тем не менее, пороги защитного механизма поставлены довольно низкими, и такая ситуация может возникнуть и при обычном восстановлении после сбоя. В этом случае, восстановление делается полуавтоматически - "по кнопке".
+Если обнаруживается, что локальный набор данных слишком сильно отличается от ожидаемого, то срабатывает защитный механизм. Сервер сообщает об этом в лог и отказывается запускаться. Это сделано, так как такой случай может свидетельствовать об ошибке конфигурации - например, если реплика одного шарда была случайно сконфигурирована, как реплика другого шарда. Тем не менее, пороги защитного механизма поставлены довольно низкими, и такая ситуация может возникнуть и при обычном восстановлении после сбоя. В этом случае, восстановление делается полуавтоматически - "по кнопке".

 Для запуска восстановления, создайте в ZooKeeper узел `/path_to_table/replica_name/flags/force_restore_data` с любым содержимым или выполните команду для восстановления всех реплицируемых таблиц:

@ -166,11 +169,10 @@ sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data

 Затем запустите сервер (перезапустите, если уже запущен). Данные будут скачаны с реплик.

-В качестве альтернативного варианта восстановления, вы можете удалить из ZooKeeper информацию о потерянной реплике - `/path_to_table/replica_name`, и затем создать реплику заново, как написано в разделе "[Создание реплицируемых таблиц](#sozdanie-replitsiruemykh-tablits)".
+В качестве альтернативного варианта восстановления, вы можете удалить из ZooKeeper информацию о потерянной реплике (`/path_to_table/replica_name`), и затем создать реплику заново, как написано в разделе [Создание реплицируемых таблиц](#sozdanie-replitsiruemykh-tablits).

 Отсутствует ограничение на использование сетевой полосы при восстановлении. Имейте это ввиду, если восстанавливаете сразу много реплик.

-
 ## Преобразование из MergeTree в ReplicatedMergeTree

 Здесь и далее, под `MergeTree` подразумеваются все движки таблиц семейства `MergeTree`, так же для `ReplicatedMergeTree`.
--- a/docs/ru/query_language/create.md
+++ b/docs/ru/query_language/create.md
@ -1,23 +1,23 @@
 ## CREATE DATABASE
+
 Создание базы данных db_name

-``` sql
+```sql
 CREATE DATABASE [IF NOT EXISTS] db_name
 ```

 `База данных` - это просто директория для таблиц.
 Если написано `IF NOT EXISTS`, то запрос не будет возвращать ошибку, если база данных уже существует.

-
-
 ## CREATE TABLE
+
 Запрос `CREATE TABLE` может иметь несколько форм.

-``` sql
-CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster]
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
 (
-    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
-    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1] [compression_codec],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2] [compression_codec],
    ...
 ) ENGINE = engine
 ```
@ -28,14 +28,14 @@ CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name [ON CLUSTER cluster]
 Описание столбца, это `name type`, в простейшем случае. Пример: `RegionID UInt32`.
 Также могут быть указаны выражения для значений по умолчанию - смотрите ниже.

-``` sql
-CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name AS [db2.]name2 [ENGINE = engine]
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name AS [db2.]name2 [ENGINE = engine]
 ```

 Создаёт таблицу с такой же структурой, как другая таблица. Можно указать другой движок для таблицы. Если движок не указан, то будет выбран такой же движок, как у таблицы `db2.name2`.

-``` sql
-CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ...
+```sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ...
 ```

 Создаёт таблицу со структурой, как результат запроса `SELECT`, с движком engine, и заполняет её данными из SELECT-а.
@ -46,7 +46,6 @@ CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ...

 ### Значения по умолчанию

-
 В описании столбца, может быть указано выражение для значения по умолчанию, одного из следующих видов:
 `DEFAULT expr`, `MATERIALIZED expr`, `ALIAS expr`.
 Пример: `URLDomain String DEFAULT domain(URL)`.
@ -67,7 +66,7 @@ CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ...

 Материализованное выражение. Такой столбец не может быть указан при INSERT, то есть, он всегда вычисляется.
 При INSERT без указания списка столбцов, такие столбцы не рассматриваются.
-Также этот столбец не подставляется при использовании звёздочки в запросе SELECT - чтобы сохранить инвариант, что дамп, полученный путём `SELECT *`, можно вставить обратно в таблицу INSERT-ом без указания списка столбцов.
+Также этот столбец не подставляется при использовании звёздочки в запросе SELECT. Это необходимо, чтобы сохранить инвариант, что дамп, полученный путём `SELECT *`, можно вставить обратно в таблицу INSERT-ом без указания списка столбцов.

 `ALIAS expr`

@ -81,60 +80,71 @@ CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db.]name ENGINE = engine AS SELECT ...

 Отсутствует возможность задать значения по умолчанию для элементов вложенных структур данных.

+### Форматы сжатия для колонок
+
+Таблица может использовать общий формат сжатия, установленный в настройках сервера, или применять к каждой колонке свой формат, указанный в `compression_codec`.
+[Подробное описание](../operations/table_engines/custom_compression_codec.md).
+
 ### Временные таблицы

-Во всех случаях, если указано `TEMPORARY`, то будет создана временная таблица. Временные таблицы обладают следующими особенностями:
+ClickHouse поддерживает временные таблицы со следующими характеристиками:
+
 - временные таблицы исчезают после завершения сессии; в том числе, при обрыве соединения;
- временная таблица создаётся с движком Memory; все остальные движки таблиц не поддерживаются;
- для временной таблицы нет возможности указать БД: она создаётся вне баз данных;
+- Временная таблица использует только движок Memory.
+- Невозможно указать базу данных для временной таблицы. Временные таблицы создаются вне баз данных.
 - если временная таблица имеет то же имя, что и некоторая другая, то, при упоминании в запросе без указания БД, будет использована временная таблица;
 - при распределённой обработке запроса, используемые в запросе временные таблицы, передаются на удалённые серверы.

+Чтобы создать временную таблицу, используйте следующий синтаксис:
+
+```sql
+CREATE TEMPORARY TABLE [IF NOT EXISTS] table_name [ON CLUSTER cluster]
+(
+    name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1],
+    name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2],
+    ...
+)
+```
+
 В большинстве случаев, временные таблицы создаются не вручную, а при использовании внешних данных для запроса, или при распределённом `(GLOBAL) IN`. Подробнее см. соответствующие разделы

-Распределенные DDL запросы (секция ON CLUSTER)
----------------------------------------------
+## Распределенные DDL запросы (секция ON CLUSTER)

 Запросы `CREATE`, `DROP`, `ALTER`, `RENAME` поддерживают возможность распределенного выполнения на кластере.
-Например, следующий запрос создает `Distributed`-таблицу `all_hits` на каждом хосте кластера `cluster`:
+Например, следующий запрос создает распределенную (Distributed) таблицу `all_hits` на каждом хосте в `cluster`:

-``` sql
+```sql
 CREATE TABLE IF NOT EXISTS all_hits ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(cluster, default, hits)
 ```

-Для корректного выполнения таких запросов необходимо на каждом хосте иметь одинаковое определение кластера (для упрощения синхронизации конфигов можете использовать подстановки из ZooKeeper), также необходимо подключение к ZooKeeper серверам.
-Локальная версия запроса в конечном итоге будет выполнена на каждом хосте кластера, даже если некоторые хосты в данный момент не доступны, гарантируется упорядоченность выполнения запросов в рамках одного хоста.
-Пока не поддерживаются `ALTER`-запросы для реплицированных таблиц.
+Для корректного выполнения таких запросов необходимо на каждом хосте иметь одинаковое определение кластера (для упрощения синхронизации конфигов можете использовать подстановки из ZooKeeper). Также необходимо подключение к ZooKeeper серверам.
+Локальная версия запроса в конечном итоге будет выполнена на каждом хосте кластера, даже если некоторые хосты в данный момент не доступны. Гарантируется упорядоченность выполнения запросов в рамках одного хоста. Для реплицированных таблиц не поддерживаются запросы `ALTER`.

 ## CREATE VIEW

-``` sql
-CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
+```sql
+CREATE [MATERIALIZED] VIEW [IF NOT EXISTS] [db.]table_name [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
 ```

 Создаёт представление. Представления бывают двух видов - обычные и материализованные (MATERIALIZED).

-При создании материализованного представления, нужно обязательно указать ENGINE - движок таблицы для хранения данных.
-
-Материализованное представление работает следующим образом: при вставлении данных в таблицу, указанную в SELECT, часть вставленных данных конвертируется запросом, а результат вставляется в представление.
-
 Обычные представления не хранят никаких данных, а всего лишь производят чтение из другой таблицы. То есть, обычное представление - не более чем сохранённый запрос. При чтении из представления, этот сохранённый запрос, используется в качестве подзапроса в секции FROM.

 Для примера, пусть вы создали представление:

-``` sql
+```sql
 CREATE VIEW view AS SELECT ...
 ```

 и написали запрос:

-``` sql
+```sql
 SELECT a, b, c FROM view
 ```

 Этот запрос полностью эквивалентен использованию подзапроса:

-``` sql
+```sql
 SELECT a, b, c FROM (SELECT ...)
 ```

@ -154,5 +164,4 @@ SELECT a, b, c FROM (SELECT ...)

 Отсутствует отдельный запрос для удаления представлений. Чтобы удалить представление, следует использовать `DROP TABLE`.

-
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/create/) <!--hide-->
--- a/docs/ru/query_language/table_functions/remote.md
+++ b/docs/ru/query_language/table_functions/remote.md
@ -1,5 +1,5 @@

-# remote
+# remote, remoteSecure

 Позволяет обратиться к удалённым серверам без создания таблицы типа `Distributed`.

@ -72,4 +72,6 @@ example01-{01..02}-{1|2}
 Если пользователь не задан,то используется `default`.
 Если пароль не задан, то используется пустой пароль.

+`remoteSecure` - аналогично функции `remote`, но с соединением по шифрованному каналу. Порт по умолчанию - `tcp_port_secure` из конфига или 9440.
+
 [Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/table_functions/remote/) <!--hide-->
--- a/libs/libglibc-compatibility/CMakeLists.txt
+++ b/libs/libglibc-compatibility/CMakeLists.txt
@ -31,4 +31,6 @@ add_library (glibc-compatibility ${GLIBC_COMPATIBILITY_SOURCES})

 target_include_directories(glibc-compatibility PRIVATE libcxxabi)

-add_subdirectory (tests)
+# Add the tests subdirectory only when ENABLE_TESTS is on.
+if (ENABLE_TESTS)
+    add_subdirectory (tests)
+endif ()
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@ -21,6 +21,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS)
    add_subdirectory (corrector_utf8)
    add_subdirectory (zookeeper-cli)
    add_subdirectory (zookeeper-dump-tree)
+    add_subdirectory (zookeeper-copy-tree)
    add_subdirectory (zookeeper-remove-by-list)
    add_subdirectory (zookeeper-create-entry-to-download-part)
    add_subdirectory (wikistat-loader)
--- a/utils/check-style/format.sh
+++ b/utils/check-style/format.sh
@ -1,4 +1,8 @@
-#/usr/bin/env bash
+#!/usr/bin/env bash
+
+# Format almost all code with current clang-format settings
+
+# Work from the directory two levels above this script's own location.
+# Everything is quoted so that paths containing spaces or glob characters work,
+# and the script stops if the directory cannot be entered instead of
+# continuing from whatever the current directory happens to be.
+cd "$(readlink -f "$(dirname "$0")")/../.." || exit 1

 clang_format=`bash -c "compgen -c clang-format | grep 'clang-format-[[:digit:]]' | sort --version-sort --reverse | head -n1"`
 if [ ! -z $clang_format ]; then
--- a/utils/zookeeper-copy-tree/CMakeLists.txt
+++ b/utils/zookeeper-copy-tree/CMakeLists.txt
@ -0,0 +1,2 @@
+# Command-line utility built from main.cpp (plus ${SRCS}, if the enclosing scope defines it).
+add_executable (zookeeper-copy-tree
+    main.cpp
+    ${SRCS})
+target_link_libraries (zookeeper-copy-tree
+    PRIVATE
+        clickhouse_common_zookeeper
+        clickhouse_common_io
+        ${Boost_PROGRAM_OPTIONS_LIBRARY})
--- a/utils/zookeeper-copy-tree/main.cpp
+++ b/utils/zookeeper-copy-tree/main.cpp
@ -0,0 +1,149 @@
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/Exception.h>
+
+#include <boost/program_options.hpp>
+
+#include <iostream>
+
+/// Error code used below when the destination path already exists.
+/// Only declared here; the definition lives outside this file.
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int UNEXPECTED_NODE_IN_ZOOKEEPER;
+}
+}
+
+/** Copy the subtree rooted at --from-path in the source ZooKeeper cluster (--from)
+  * to --to-path in the destination cluster (--to).
+  *
+  * The source tree is traversed breadth-first. Reads (node data and list of children) and
+  * creations in the destination are issued asynchronously; the creation of a parent is
+  * awaited before the first of its children is created.
+  * Ephemeral nodes and nodes that disappear from the source while copying are skipped
+  * together with their subtrees. Every copied node is created as persistent.
+  *
+  * Returns 1 after printing the help message when --help is given.
+  * Throws if the destination path already exists or a required option is missing;
+  * any exception is printed to stderr and rethrown.
+  */
+int main(int argc, char ** argv)
+try
+{
+    boost::program_options::options_description desc("Allowed options");
+    desc.add_options()
+        ("help,h", "produce help message")
+        ("from", boost::program_options::value<std::string>()->required(),
+            "addresses of source ZooKeeper instances, comma separated. Example: example01e.yandex.ru:2181")
+        ("from-path", boost::program_options::value<std::string>()->required(),
+            "where to copy from")
+        ("to", boost::program_options::value<std::string>()->required(),
+            "addresses of destination ZooKeeper instances, comma separated. Example: example01e.yandex.ru:2181")
+        ("to-path", boost::program_options::value<std::string>()->required(),
+            "where to copy to")
+    ;
+
+    boost::program_options::variables_map options;
+    boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options);
+
+    if (options.count("help"))
+    {
+        std::cout << "Copy a ZooKeeper tree to another cluster." << std::endl;
+        std::cout << "Usage: " << argv[0] << " [options]" << std::endl;
+        std::cout << "WARNING: it is almost useless as it is impossible to correctly copy sequential nodes" << std::endl;
+        std::cout << desc << std::endl;
+        return 1;
+    }
+
+    /// Check required options only after --help has been handled, so that help can be requested alone.
+    /// Without notify() the required() markers are never enforced and a missing option
+    /// would only show up as an uninformative std::out_of_range from options.at() below.
+    boost::program_options::notify(options);
+
+    zkutil::ZooKeeper from_zookeeper(options.at("from").as<std::string>());
+    zkutil::ZooKeeper to_zookeeper(options.at("to").as<std::string>());
+
+    std::string from_path = options.at("from-path").as<std::string>();
+    std::string to_path = options.at("to-path").as<std::string>();
+
+    /// Refuse to copy into an existing node.
+    if (to_zookeeper.exists(to_path))
+        throw DB::Exception("Destination path: " + to_path + " already exists, aborting.",
+            DB::ErrorCodes::UNEXPECTED_NODE_IN_ZOOKEEPER);
+
+    /// One source node together with its in-flight requests and copy status.
+    struct Node
+    {
+        Node(
+            std::string path_,
+            std::future<Coordination::GetResponse> get_future_,
+            std::future<Coordination::ListResponse> children_future_,
+            Node * parent_)
+            : path(std::move(path_))
+            , get_future(std::move(get_future_))
+            , children_future(std::move(children_future_))
+            , parent(parent_)
+        {
+        }
+
+        std::string path;   /// Path in the source cluster.
+        std::future<Coordination::GetResponse> get_future;
+        std::future<Coordination::ListResponse> children_future;
+
+        Node * parent = nullptr;
+        std::future<Coordination::CreateResponse> create_future;
+        bool created = false;     /// create_future has been awaited successfully.
+        bool deleted = false;     /// The node vanished from the source before it could be read.
+        bool ephemeral = false;   /// The node is ephemeral and is not copied.
+    };
+
+    /// std::list keeps element addresses stable, so children can hold a pointer to their parent
+    /// while new elements are appended during the traversal.
+    std::list<Node> nodes_queue;
+    nodes_queue.emplace_back(
+        from_path, from_zookeeper.asyncGet(from_path), from_zookeeper.asyncGetChildren(from_path), nullptr);
+
+    to_zookeeper.createAncestors(to_path);
+
+    /// Breadth-first traversal: the queue grows while it is being iterated.
+    for (auto it = nodes_queue.begin(); it != nodes_queue.end(); ++it)
+    {
+        Coordination::GetResponse get_response;
+        Coordination::ListResponse children_response;
+        try
+        {
+            get_response = it->get_future.get();
+            children_response = it->children_future.get();
+        }
+        catch (const Coordination::Exception & e)
+        {
+            if (e.code == Coordination::ZNONODE)
+            {
+                it->deleted = true;
+                continue;
+            }
+            throw;
+        }
+
+        if (get_response.stat.ephemeralOwner)
+        {
+            it->ephemeral = true;
+            continue;
+        }
+
+        /// The parent must exist in the destination before a child can be created there.
+        if (it->parent && !it->parent->created)
+        {
+            it->parent->create_future.get();
+            it->parent->created = true;
+            std::cerr << it->parent->path << " copied!" << std::endl;
+        }
+
+        /// Map the source path onto the destination by swapping the from_path prefix for to_path.
+        /// When from_path is "/", the remainder has no leading slash (e.g. "a" for "/a"),
+        /// so the separator has to be inserted explicitly.
+        const std::string relative_path = it->path.substr(from_path.length());
+        std::string new_path = to_path;
+        if (!relative_path.empty() && relative_path.front() != '/')
+            new_path += '/';
+        new_path += relative_path;
+
+        it->create_future = to_zookeeper.asyncCreate(new_path, get_response.data, zkutil::CreateMode::Persistent);
+        /// Release the node data right away.
+        get_response.data.clear();
+        get_response.data.shrink_to_fit();
+
+        for (const auto & name : children_response.names)
+        {
+            std::string child_path = it->path == "/" ? it->path + name : it->path + '/' + name;
+            nodes_queue.emplace_back(
+                child_path, from_zookeeper.asyncGet(child_path), from_zookeeper.asyncGetChildren(child_path),
+                &(*it));
+        }
+    }
+
+    /// Await the creations that were not already awaited on behalf of a child.
+    for (auto it = nodes_queue.begin(); it != nodes_queue.end(); ++it)
+    {
+        if (!it->created && !it->deleted && !it->ephemeral)
+        {
+            it->create_future.get();
+            it->created = true;
+            std::cerr << it->path << " copied!" << std::endl;
+        }
+    }
+}
+catch (...)
+{
+    std::cerr << DB::getCurrentExceptionMessage(true) << '\n';
+    throw;
+}