Merge remote-tracking branch 'upstream/master' into HEAD

Anton Popov 2021-02-26 15:10:52 +03:00
commit 6ed3cf5511
110 changed files with 4027 additions and 1588 deletions

View File

@ -1,5 +1,20 @@
#pragma once
/// __has_feature is supported only by clang.
///
/// But libcxx/libcxxabi overrides it to 0,
/// thus the checks for __has_feature will be wrong.
///
/// NOTE:
/// - __has_feature cannot be simply undefined,
///   since this would break if some C++ header is included after
///   <common/defines.h>
/// - it should not fall back to 0,
///   since that may create false-positive detections (a common problem)
#if defined(__clang__) && defined(__has_feature)
# define ch_has_feature __has_feature
#endif
#if defined(_MSC_VER)
# if !defined(likely)
# define likely(x) (x)
@ -32,8 +47,8 @@
/// Check for presence of address sanitizer
#if !defined(ADDRESS_SANITIZER)
# if defined(__has_feature)
# if __has_feature(address_sanitizer)
# if defined(ch_has_feature)
# if ch_has_feature(address_sanitizer)
# define ADDRESS_SANITIZER 1
# endif
# elif defined(__SANITIZE_ADDRESS__)
@ -42,8 +57,8 @@
#endif
#if !defined(THREAD_SANITIZER)
# if defined(__has_feature)
# if __has_feature(thread_sanitizer)
# if defined(ch_has_feature)
# if ch_has_feature(thread_sanitizer)
# define THREAD_SANITIZER 1
# endif
# elif defined(__SANITIZE_THREAD__)
@ -52,8 +67,8 @@
#endif
#if !defined(MEMORY_SANITIZER)
# if defined(__has_feature)
# if __has_feature(memory_sanitizer)
# if defined(ch_has_feature)
# if ch_has_feature(memory_sanitizer)
# define MEMORY_SANITIZER 1
# endif
# elif defined(__MEMORY_SANITIZER__)

View File

@ -15,11 +15,11 @@
#endif
#define __msan_unpoison(X, Y) // NOLINT
#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
# undef __msan_unpoison
# include <sanitizer/msan_interface.h>
# endif
#if defined(ch_has_feature)
# if ch_has_feature(memory_sanitizer)
# undef __msan_unpoison
# include <sanitizer/msan_interface.h>
# endif
#endif
#include <link.h>

View File

@ -51,10 +51,11 @@ Connection::Connection(
const char* ssl_key,
unsigned timeout,
unsigned rw_timeout,
bool enable_local_infile)
bool enable_local_infile,
bool opt_reconnect)
: Connection()
{
connect(db, server, user, password, port, socket, ssl_ca, ssl_cert, ssl_key, timeout, rw_timeout, enable_local_infile);
connect(db, server, user, password, port, socket, ssl_ca, ssl_cert, ssl_key, timeout, rw_timeout, enable_local_infile, opt_reconnect);
}
Connection::Connection(const std::string & config_name)
@ -80,7 +81,8 @@ void Connection::connect(const char* db,
const char * ssl_key,
unsigned timeout,
unsigned rw_timeout,
bool enable_local_infile)
bool enable_local_infile,
bool opt_reconnect)
{
if (is_connected)
disconnect();
@ -104,9 +106,8 @@ void Connection::connect(const char* db,
if (mysql_options(driver.get(), MYSQL_OPT_LOCAL_INFILE, &enable_local_infile_arg))
throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
/// Enables auto-reconnect.
bool reconnect = true;
if (mysql_options(driver.get(), MYSQL_OPT_RECONNECT, reinterpret_cast<const char *>(&reconnect)))
/// See C API Developer Guide: Automatic Reconnection Control
if (mysql_options(driver.get(), MYSQL_OPT_RECONNECT, reinterpret_cast<const char *>(&opt_reconnect)))
throw ConnectionFailed(errorMessage(driver.get()), mysql_errno(driver.get()));
/// Specifies a particular SSL key and certificate if needed

View File

@ -14,6 +14,8 @@
/// Disable LOAD DATA LOCAL INFILE because it is insecure
#define MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE false
/// See https://dev.mysql.com/doc/c-api/5.7/en/c-api-auto-reconnect.html
#define MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT true
namespace mysqlxx
@ -76,7 +78,8 @@ public:
const char * ssl_key = "",
unsigned timeout = MYSQLXX_DEFAULT_TIMEOUT,
unsigned rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT,
bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE);
bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE,
bool opt_reconnect = MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT);
/// Creates connection. Can be used if Poco::Util::Application is used.
/// All settings will be taken from the config_name section of the configuration.
@ -96,7 +99,8 @@ public:
const char* ssl_key,
unsigned timeout = MYSQLXX_DEFAULT_TIMEOUT,
unsigned rw_timeout = MYSQLXX_DEFAULT_RW_TIMEOUT,
bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE);
bool enable_local_infile = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE,
bool opt_reconnect = MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT);
void connect(const std::string & config_name)
{
@ -112,6 +116,7 @@ public:
std::string ssl_cert = cfg.getString(config_name + ".ssl_cert", "");
std::string ssl_key = cfg.getString(config_name + ".ssl_key", "");
bool enable_local_infile = cfg.getBool(config_name + ".enable_local_infile", MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE);
bool opt_reconnect = cfg.getBool(config_name + ".opt_reconnect", MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT);
unsigned timeout =
cfg.getInt(config_name + ".connect_timeout",
@ -135,7 +140,8 @@ public:
ssl_key.c_str(),
timeout,
rw_timeout,
enable_local_infile);
enable_local_infile,
opt_reconnect);
}
/// If MySQL connection was established.

View File

@ -78,6 +78,9 @@ Pool::Pool(const Poco::Util::AbstractConfiguration & cfg, const std::string & co
enable_local_infile = cfg.getBool(config_name + ".enable_local_infile",
cfg.getBool(parent_config_name + ".enable_local_infile", MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE));
opt_reconnect = cfg.getBool(config_name + ".opt_reconnect",
cfg.getBool(parent_config_name + ".opt_reconnect", MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT));
}
else
{
@ -96,6 +99,8 @@ Pool::Pool(const Poco::Util::AbstractConfiguration & cfg, const std::string & co
enable_local_infile = cfg.getBool(
config_name + ".enable_local_infile", MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE);
opt_reconnect = cfg.getBool(config_name + ".opt_reconnect", MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT);
}
connect_timeout = cfg.getInt(config_name + ".connect_timeout",
@ -233,7 +238,8 @@ void Pool::Entry::forceConnected() const
pool->ssl_key.c_str(),
pool->connect_timeout,
pool->rw_timeout,
pool->enable_local_infile);
pool->enable_local_infile,
pool->opt_reconnect);
}
}
@ -248,7 +254,7 @@ bool Pool::Entry::tryForceConnected() const
if (prev_connection_id != current_connection_id)
{
auto & logger = Poco::Util::Application::instance().logger();
logger.information("Connection to mysql server has been reestablished. Connection id changed: %lu -> %lu",
logger.information("Reconnected to mysql server. Connection id changed: %lu -> %lu",
prev_connection_id, current_connection_id);
}
return true;
@ -294,7 +300,8 @@ Pool::Connection * Pool::allocConnection(bool dont_throw_if_failed_first_time)
ssl_key.c_str(),
connect_timeout,
rw_timeout,
enable_local_infile);
enable_local_infile,
opt_reconnect);
}
catch (mysqlxx::ConnectionFailed & e)
{

View File

@ -165,10 +165,12 @@ public:
unsigned rw_timeout_ = MYSQLXX_DEFAULT_RW_TIMEOUT,
unsigned default_connections_ = MYSQLXX_POOL_DEFAULT_START_CONNECTIONS,
unsigned max_connections_ = MYSQLXX_POOL_DEFAULT_MAX_CONNECTIONS,
unsigned enable_local_infile_ = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE)
unsigned enable_local_infile_ = MYSQLXX_DEFAULT_ENABLE_LOCAL_INFILE,
bool opt_reconnect_ = MYSQLXX_DEFAULT_MYSQL_OPT_RECONNECT)
: default_connections(default_connections_), max_connections(max_connections_),
db(db_), server(server_), user(user_), password(password_), port(port_), socket(socket_),
connect_timeout(connect_timeout_), rw_timeout(rw_timeout_), enable_local_infile(enable_local_infile_) {}
connect_timeout(connect_timeout_), rw_timeout(rw_timeout_), enable_local_infile(enable_local_infile_),
opt_reconnect(opt_reconnect_) {}
Pool(const Pool & other)
: default_connections{other.default_connections},
@ -177,7 +179,7 @@ public:
user{other.user}, password{other.password},
port{other.port}, socket{other.socket},
connect_timeout{other.connect_timeout}, rw_timeout{other.rw_timeout},
enable_local_infile{other.enable_local_infile}
enable_local_infile{other.enable_local_infile}, opt_reconnect(other.opt_reconnect)
{}
Pool & operator=(const Pool &) = delete;
@ -231,6 +233,7 @@ private:
std::string ssl_cert;
std::string ssl_key;
bool enable_local_infile;
bool opt_reconnect;
/// True if connection was established at least once.
bool was_successful{false};

View File

@ -1,3 +1,8 @@
#include <algorithm>
#include <ctime>
#include <random>
#include <thread>
#include <mysqlxx/PoolWithFailover.h>
@ -33,6 +38,19 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con
std::make_shared<Pool>(config_, replica_name, default_connections_, max_connections_, config_name_.c_str()));
}
}
/// PoolWithFailover objects are stored in a cache inside PoolFactory.
/// This cache is reset by ExternalDictionariesLoader after every SYSTEM RELOAD DICTIONAR{Y|IES}
/// which triggers massive re-constructing of connection pools.
/// The state of PRNGs like std::mt19937 is considered to be quite heavy
/// thus here we attempt to optimize its construction.
static thread_local std::mt19937 rnd_generator(
std::hash<std::thread::id>{}(std::this_thread::get_id()) + std::clock());
for (auto & [_, replicas] : replicas_by_priority)
{
if (replicas.size() > 1)
std::shuffle(replicas.begin(), replicas.end(), rnd_generator);
}
}
else
{

contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793
Subproject commit 9a0d78de4b90546368d954b6434f0e9a823e8d80

View File

@ -70,6 +70,7 @@ function start_server
--path "$FASTTEST_DATA"
--user_files_path "$FASTTEST_DATA/user_files"
--top_level_domains_path "$FASTTEST_DATA/top_level_domains"
--test_keeper_server.log_storage_path "$FASTTEST_DATA/coordination"
)
clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" &
server_pid=$!
@ -375,7 +376,7 @@ function run_tests
stop_server ||:
# Clean the data so that there is no interference from the previous test run.
rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files} ||:
rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files,coordination} ||:
start_server

View File

@ -9,7 +9,7 @@ Calculates the arithmetic mean.
**Syntax**
``` sql
avgWeighted(x)
avg(x)
```
**Arguments**

View File

@ -61,40 +61,58 @@ int32samoa: 1546300800
Converts a date or date with time to a UInt16 number containing the year number (AD).
Alias: `YEAR`.
## toQuarter {#toquarter}
Converts a date or date with time to a UInt8 number containing the quarter number.
Alias: `QUARTER`.
## toMonth {#tomonth}
Converts a date or date with time to a UInt8 number containing the month number (1-12).
Alias: `MONTH`.
## toDayOfYear {#todayofyear}
Converts a date or date with time to a UInt16 number containing the number of the day of the year (1-366).
Alias: `DAYOFYEAR`.
## toDayOfMonth {#todayofmonth}
Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31).
Aliases: `DAYOFMONTH`, `DAY`.
## toDayOfWeek {#todayofweek}
Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7).
Alias: `DAYOFWEEK`.
## toHour {#tohour}
Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23).
This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true; even in Moscow the clocks were twice changed at a different time).
Alias: `HOUR`.
## toMinute {#tominute}
Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59).
Alias: `MINUTE`.
## toSecond {#tosecond}
Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59).
Leap seconds are not accounted for.
Alias: `SECOND`.
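Since each of these extractors takes the same argument, a single query is enough to sanity-check several of them at once; for `2021-02-26 15:10:52` (a Friday, day 57 of the year) this returns year 2021, quarter 1, day_of_week 5:

``` sql
SELECT
    toYear(dt)      AS year,
    toQuarter(dt)   AS quarter,
    toMonth(dt)     AS month,
    toDayOfYear(dt) AS day_of_year,
    toDayOfWeek(dt) AS day_of_week,
    toHour(dt)      AS hour,
    toMinute(dt)    AS minute,
    toSecond(dt)    AS second
FROM (SELECT toDateTime('2021-02-26 15:10:52') AS dt);
```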
## toUnixTimestamp {#to-unix-timestamp}
For a DateTime argument: converts the value to a UInt32 number -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time).

View File

@ -75,6 +75,8 @@ Result:
Returns a string containing the argument's hexadecimal representation.
Alias: `HEX`.
**Syntax**
``` sql

View File

@ -13,6 +13,8 @@ Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal
isNull(x)
```
Alias: `ISNULL`.
**Arguments**
- `x` — A value with a non-compound data type.
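A minimal illustration of the function and its new alias; both forms return `1` for `NULL` and `0` otherwise:

``` sql
SELECT isNull(NULL) AS with_null, ISNULL(1) AS with_value;
```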

View File

@ -9,10 +9,14 @@ toc_title: IP Addresses
Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form).
Alias: `INET_NTOA`.
## IPv4StringToNum(s) {#ipv4stringtonums}
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0.
Alias: `INET_ATON`.
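As an illustration, the two functions (and the aliases added here) round-trip an address:

``` sql
SELECT
    IPv4NumToString(3232235521) AS ip,   -- '192.168.0.1'
    INET_ATON('192.168.0.1')    AS num;  -- 3232235521
```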
## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum}
Similar to IPv4NumToString, but using xxx instead of the last octet.
@ -49,7 +53,11 @@ Since using xxx is highly unusual, this may be changed in the future. We r
### IPv6NumToString(x) {#ipv6numtostringx}
Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format.
IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. Examples:
IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44.
Alias: `INET6_NTOA`.
Examples:
``` sql
SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr
@ -119,6 +127,8 @@ The reverse function of IPv6NumToString. If the IPv6 address has an invalid form
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
``` sql
SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0);
```

View File

@ -98,6 +98,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b')
Repeats a string as many times as specified and concatenates the replicated values as a single string.
Alias: `REPEAT`.
**Syntax**
``` sql
@ -276,10 +278,14 @@ Returns the string s that was converted from the encoding in from to
Encodes s string into base64
Alias: `TO_BASE64`.
## base64Decode(s) {#base64decode}
Decodes the base64-encoded string s into the original string. In case of failure, raises an exception.
Alias: `FROM_BASE64`.
## tryBase64Decode(s) {#trybase64decode}
Similar to base64Decode, but in case of error an empty string would be returned.
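A quick round trip showing all three functions, including the failure behavior of the `try` variant:

``` sql
SELECT
    base64Encode('ClickHouse')         AS encoded,   -- 'Q2xpY2tIb3VzZQ=='
    base64Decode('Q2xpY2tIb3VzZQ==')   AS decoded,   -- 'ClickHouse'
    tryBase64Decode('invalid base64!') AS fallback;  -- ''
```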

View File

@ -36,10 +36,14 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f
**Example**
Query:
``` sql
SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8)
SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8);
```
Result:
``` text
┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐
│ -9223372036854775808 │ 32 │ 16 │ 8 │
@ -52,10 +56,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3
**Example**
Query:
``` sql
select toInt64OrZero('123123'), toInt8OrZero('123qwe123')
SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123');
```
Result:
``` text
┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐
│ 123123 │ 0 │
@ -68,10 +76,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3
**Example**
Query:
``` sql
select toInt64OrNull('123123'), toInt8OrNull('123qwe123')
SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123');
```
Result:
``` text
┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐
│ 123123 │ ᴺᵁᴸᴸ │
@ -102,10 +114,14 @@ The behavior of functions for negative arguments and for the [NaN and Inf](../..
**Example**
Query:
``` sql
SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8)
SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8);
```
Result:
``` text
┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐
│ 9223372036854775808 │ 4294967264 │ 16 │ 8 │
@ -124,6 +140,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8)
## toDate {#todate}
Alias: `DATE`.
## toDateOrZero {#todateorzero}
## toDateOrNull {#todateornull}
@ -168,20 +186,28 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains:
**Examples**
Query:
``` sql
SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val)
SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val);
```
Result:
``` text
┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐
│ -1.11100 │ Nullable(Decimal(9, 5)) │
└──────────┴────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val)
SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val);
```
Result:
``` text
┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐
│ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │
@ -213,20 +239,28 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains:
**Example**
Query:
``` sql
SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val)
SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val);
```
Result:
``` text
┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐
│ -1.11100 │ Decimal(9, 5) │
└──────────┴────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val)
SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val);
```
Result:
``` text
┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐
│ 0.00 │ Decimal(9, 2) │
@ -258,12 +292,18 @@ Conversion between numeric types uses the same rules as assignments between diff
Additionally, the toString function of the DateTime argument can take a second String argument containing the name of the time zone. Example: `Asia/Yekaterinburg`. In this case, the time is formatted according to the specified time zone.
**Example**
Query:
``` sql
SELECT
now() AS now_local,
toString(now(), 'Asia/Yekaterinburg') AS now_yekat
toString(now(), 'Asia/Yekaterinburg') AS now_yekat;
```
Result:
``` text
┌───────────now_local─┬─now_yekat───────────┐
│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │
@ -281,36 +321,81 @@ If the string has fewer bytes than N, it is padded with null bytes to the right.
Accepts a String or FixedString argument. Returns the String with the content truncated at the first zero byte found.
Example:
**Example**
Query:
``` sql
SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut
SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut;
```
Result:
``` text
┌─s─────────────┬─s_cut─┐
│ foo\0\0\0\0\0 │ foo │
└───────────────┴───────┘
```
Query:
``` sql
SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut
SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut;
```
Result:
``` text
┌─s──────────┬─s_cut─┐
│ foo\0bar\0 │ foo │
└────────────┴───────┘
```
## reinterpretAsUInt(8\|16\|32\|64) {#reinterpretasuint8163264}
## reinterpretAsInt(8\|16\|32\|64) {#reinterpretasint8163264}
## reinterpretAsFloat(32\|64) {#reinterpretasfloat3264}
## reinterpretAsDate {#reinterpretasdate}
## reinterpretAsDateTime {#reinterpretasdatetime}
These functions accept a string and interpret the bytes placed at the beginning of the string as a number in host order (little endian). If the string isn't long enough, the functions work as if the string is padded with the necessary number of null bytes. If the string is longer than needed, the extra bytes are ignored. A date is interpreted as the number of days since the beginning of the Unix Epoch, and a date with time is interpreted as the number of seconds since the beginning of the Unix Epoch.
## reinterpretAsString {#type_conversion_functions-reinterpretAsString}
This function accepts a number or date or date with time, and returns a string containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long.
## reinterpretAsFixedString {#reinterpretasfixedstring}
This function accepts a number or date or date with time, and returns a FixedString containing bytes representing the corresponding value in host order (little endian). Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a FixedString that is one byte long.
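As a quick check of the little-endian, null-trimming behavior described above (a sketch; any small integer works), note that ClickHouse lets a column alias be reused within the same SELECT list:

``` sql
SELECT
    reinterpretAsString(toUInt32(255)) AS s,   -- a single 0xFF byte
    length(s)                          AS len; -- 1
```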
## reinterpretAsUUID {#reinterpretasuuid}
This function accepts a 16-byte string and returns a UUID containing bytes representing the corresponding value in network byte order (big-endian). If the string isn't long enough, the function works as if the string were padded with the necessary number of null bytes at the end. If the string is longer than 16 bytes, the extra bytes at the end are ignored.
**Syntax**
``` sql
reinterpretAsUUID(fixed_string)
```
**Parameters**
- `fixed_string` — Big-endian byte string. [FixedString](../../sql-reference/data-types/fixedstring.md#fixedstring).
**Returned value**

- The UUID type value. [UUID](../../sql-reference/data-types/uuid.md#uuid-data-type).

## reinterpret(x, T) {#type_conversion_function-reinterpret}

Performs byte reinterpretation of `x` as the `T` data type.

The following reinterpretations are allowed:

1. Any type that has a fixed size and whose value can be represented continuously, into FixedString.
2. Any type whose value can be represented continuously, into String. Null bytes are dropped from the end. For example, a UInt32 type value of 255 is a string that is one byte long.
3. FixedString, String, and types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID), into one another.
**Examples**
String to UUID.
Query:
``` sql
SELECT reinterpret(toInt8(-1), 'UInt8') as int_to_uint,
@ -318,39 +403,45 @@ SELECT reinterpret(toInt8(-1), 'UInt8') as int_to_uint,
reinterpret('1', 'UInt32') as string_to_int;
```
Result:
``` text
┌─int_to_uint─┬─int_to_float─┬─string_to_int─┐
│         255 │        1e-45 │            49 │
└─────────────┴──────────────┴───────────────┘
┌─reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f')))─┐
│ 08090a0b-0c0d-0e0f-0001-020304050607                                  │
└───────────────────────────────────────────────────────────────────────┘
```
Going back and forth from String to UUID.

Query:

``` sql
WITH
    generateUUIDv4() AS uuid,
    identity(lower(hex(reverse(reinterpretAsString(uuid))))) AS str,
    reinterpretAsUUID(reverse(unhex(str))) AS uuid2
SELECT uuid = uuid2;
```

Result:

``` text
┌─equals(uuid, uuid2)─┐
│                   1 │
└─────────────────────┘
```

## reinterpretAsUInt(8\|16\|32\|64\|256) {#reinterpretAsUInt8163264256}

## reinterpretAsInt(8\|16\|32\|64\|128\|256) {#reinterpretAsInt8163264128256}

## reinterpretAsDecimal(32\|64\|128\|256) {#reinterpretAsDecimal3264128256}

## reinterpretAsFloat(32\|64) {#type_conversion_function-reinterpretAsFloat}

## reinterpretAsDate {#type_conversion_function-reinterpretAsDate}

## reinterpretAsDateTime {#type_conversion_function-reinterpretAsDateTime}

## reinterpretAsDateTime64 {#type_conversion_function-reinterpretAsDateTime64}

## reinterpretAsString {#type_conversion_function-reinterpretAsString}

## reinterpretAsFixedString {#type_conversion_function-reinterpretAsFixedString}

## reinterpretAsUUID {#type_conversion_function-reinterpretAsUUID}

These functions are aliases for the `reinterpret` function.
## CAST(x, T) {#type_conversion_function-cast}
Converts x to the t data type. The syntax CAST(x AS t) is also supported.
Converts input value `x` to the `T` data type.
Example:
The syntax `CAST(x AS t)` is also supported.
Note that if the value `x` does not fit the bounds of type `T`, the function overflows. For example, `CAST(-1, 'UInt8')` returns 255.
**Example**
Query:
``` sql
SELECT
@ -358,9 +449,11 @@ SELECT
CAST(timestamp AS DateTime) AS datetime,
CAST(timestamp AS Date) AS date,
CAST(timestamp, 'String') AS string,
CAST(timestamp, 'FixedString(22)') AS fixed_string
CAST(timestamp, 'FixedString(22)') AS fixed_string;
```
Result:
``` text
┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐
│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │
@ -369,12 +462,18 @@ SELECT
Conversion to FixedString(N) only works for arguments of type String or FixedString(N).
Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example:
Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported.
**Example**
Query:
``` sql
SELECT toTypeName(x) FROM t_null
SELECT toTypeName(x) FROM t_null;
```
Result:
``` text
┌─toTypeName(x)─┐
│ Int8 │
@ -382,10 +481,14 @@ SELECT toTypeName(x) FROM t_null
└───────────────┘
```
Query:
``` sql
SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null
SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null;
```
Result:
``` text
┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐
│ Nullable(UInt16) │
@ -399,15 +502,19 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null
## accurateCast(x, T) {#type_conversion_function-accurate-cast}
Converts x to the t data type. The differente from cast(x, T) is that accurateCast
does not allow overflow of numeric types during cast if type value x does not fit
bounds of type T.
Converts `x` to the `T` data type.
The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception.
**Example**
Query:
Example
``` sql
SELECT cast(-1, 'UInt8') as uint8;
```
Result:
``` text
┌─uint8─┐
@ -415,38 +522,46 @@ SELECT cast(-1, 'UInt8') as uint8;
└───────┘
```
Query:
```sql
SELECT accurateCast(-1, 'UInt8') as uint8;
```
Result:
``` text
Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8.
```
## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null}
Converts x to the t data type. Always returns nullable type and returns NULL
if the casted value is not representable in the target type.
Converts input value `x` to the specified data type `T`. Always returns [Nullable](../../sql-reference/data-types/nullable.md) type and returns [NULL](../../sql-reference/syntax.md#null-literal) if the casted value is not representable in the target type.
Example:
**Syntax**
```sql
accurateCastOrNull(x, T)
```
**Parameters**
- `x` — Input value.
- `T` — The name of the returned data type.
**Returned value**
- The value, converted to the specified data type `T`.
**Example**
Query:
``` sql
SELECT
accurateCastOrNull(-1, 'UInt8') as uint8,
accurateCastOrNull(128, 'Int8') as int8,
accurateCastOrNull('Test', 'FixedString(2)') as fixed_string
SELECT toTypeName(accurateCastOrNull(5, 'UInt8'));
```
``` text
┌─uint8─┬─int8─┬─fixed_string─┐
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │
└───────┴──────┴──────────────┘┘
```
``` sql
SELECT toTypeName(accurateCastOrNull(5, 'UInt8'))
```
Result:
``` text
┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐
@ -454,6 +569,23 @@ SELECT toTypeName(accurateCastOrNull(5, 'UInt8'))
└────────────────────────────────────────────┘
```
Query:
``` sql
SELECT
accurateCastOrNull(-1, 'UInt8') as uint8,
accurateCastOrNull(128, 'Int8') as int8,
accurateCastOrNull('Test', 'FixedString(2)') as fixed_string;
```
Result:
``` text
┌─uint8─┬─int8─┬─fixed_string─┐
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │
└───────┴──────┴──────────────┘
```
## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval}
Converts a Number type argument to an [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type.
@ -481,6 +613,8 @@ toIntervalYear(number)
**Example**
Query:
``` sql
WITH
toDate('2019-01-01') AS date,
@ -488,9 +622,11 @@ WITH
toIntervalWeek(1) AS interval_to_week
SELECT
date + interval_week,
date + interval_to_week
date + interval_to_week;
```
Result:
``` text
┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐
│ 2019-01-08 │ 2019-01-08 │
@ -506,7 +642,7 @@ The function parses [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 112
**Syntax**
``` sql
parseDateTimeBestEffort(time_string [, time_zone]);
parseDateTimeBestEffort(time_string [, time_zone])
```
**Arguments**
@ -549,7 +685,7 @@ Query:
``` sql
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow')
AS parseDateTimeBestEffort
AS parseDateTimeBestEffort;
```
Result:
@ -564,7 +700,7 @@ Query:
``` sql
SELECT parseDateTimeBestEffort('1284101485')
AS parseDateTimeBestEffort
AS parseDateTimeBestEffort;
```
Result:
@ -579,7 +715,7 @@ Query:
``` sql
SELECT parseDateTimeBestEffort('2018-12-12 10:12:12')
AS parseDateTimeBestEffort
AS parseDateTimeBestEffort;
```
Result:
@ -593,7 +729,7 @@ Result:
Query:
``` sql
SELECT parseDateTimeBestEffort('10 20:19')
SELECT parseDateTimeBestEffort('10 20:19');
```
Result:
@ -613,12 +749,12 @@ Result:
## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS}
This function is similar to [parseDateTimeBestEffort](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity.
This function is similar to [parseDateTimeBestEffort](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity.
**Syntax**
``` sql
parseDateTimeBestEffortUS(time_string [, time_zone]);
parseDateTimeBestEffortUS(time_string [, time_zone])
```
**Arguments**
@ -693,6 +829,178 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r
Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed.
## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull}
Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed.
**Syntax**
``` sql
parseDateTimeBestEffortUSOrNull(time_string[, time_zone])
```
**Parameters**
- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md).
- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md).
**Supported non-standard formats**
- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time).
- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc.
- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc.
- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`.
- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`.
**Returned values**
- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type.
- `NULL` if the input string cannot be converted to the `DateTime` data type.
**Examples**
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ 2021-02-10 21:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ 2021-02-11 00:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ 2021-02-10 00:00:00 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ ᴺᵁᴸᴸ │
└─────────────────────────────────┘
```
## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero}
Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed.
**Syntax**
``` sql
parseDateTimeBestEffortUSOrZero(time_string[, time_zone])
```
**Parameters**
- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md).
- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md).
**Supported non-standard formats**
- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time).
- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc.
- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc.
- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`.
- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`.
**Returned values**
- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type.
- Zero date or zero date with time if the input string cannot be converted to the `DateTime` data type.
**Examples**
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 2021-02-10 21:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 2021-02-11 00:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 2021-02-10 00:00:00 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 1970-01-01 00:00:00 │
└─────────────────────────────────┘
```
## toLowCardinality {#tolowcardinality}
Converts the input parameter to the [LowCardinality](../../sql-reference/data-types/lowcardinality.md) version of the same data type.
@ -720,7 +1028,7 @@ Type: `LowCardinality(expr_result_type)`
Query:
``` sql
SELECT toLowCardinality('1')
SELECT toLowCardinality('1');
```
Result:
@ -759,7 +1067,7 @@ Query:
``` sql
WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64
SELECT toUnixTimestamp64Milli(dt64)
SELECT toUnixTimestamp64Milli(dt64);
```
Result:
@ -772,7 +1080,7 @@ Result:
``` sql
WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64
SELECT toUnixTimestamp64Nano(dt64)
SELECT toUnixTimestamp64Nano(dt64);
```
Result:
@ -806,13 +1114,17 @@ fromUnixTimestamp64Milli(value [, ti])
- `value` converted to the `DateTime64` data type.
**Examples**
**Example**
Query:
``` sql
WITH CAST(1234567891011, 'Int64') AS i64
SELECT fromUnixTimestamp64Milli(i64, 'UTC')
SELECT fromUnixTimestamp64Milli(i64, 'UTC');
```
Result:
``` text
┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐
│ 2009-02-13 23:31:31.011 │
@ -844,7 +1156,7 @@ Query:
``` sql
SELECT formatRow('CSV', number, 'good')
FROM numbers(3)
FROM numbers(3);
```
Result:
@ -885,7 +1197,7 @@ Query:
``` sql
SELECT formatRowNoNewline('CSV', number, 'good')
FROM numbers(3)
FROM numbers(3);
```
Result:

View File

@ -13,10 +13,28 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ...
If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query.
Dont list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”), then use a subquery.
Dont list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)), then use a subquery.
The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets.
ClickHouse allows types to differ in the left and the right parts of the `IN` subquery. In this case it converts the left side value to the type of the right side, as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. That means that the data type becomes [Nullable](../../sql-reference/data-types/nullable.md), and if the conversion cannot be performed, it returns [NULL](../../sql-reference/syntax.md#null-literal).
**Example**
Query:
``` sql
SELECT '1' IN (SELECT 1);
```
Result:
``` text
┌─in('1', _subquery49)─┐
│ 1 │
└──────────────────────┘
```
If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the users temporary table, which should be filtered.
If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query.
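A hedged sketch of the Set-engine variant (the table names `userid_set` and `hits` are illustrative, not from this page):

``` sql
-- A prepared data set that is kept in RAM and reused across queries.
CREATE TABLE userid_set (UserID UInt64) ENGINE = Set;
INSERT INTO userid_set VALUES (123), (456);

-- The set is referenced by name; it is not rebuilt for each query.
SELECT count() FROM hits WHERE UserID IN userid_set;
```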

View File

@ -63,40 +63,58 @@ int32samoa: 1546300800
Converts a date or date with time to a UInt16 number containing the year number (AD).
Alias: `YEAR`.
## toQuarter {#toquarter}
Converts a date or date with time to a UInt8 number containing the quarter number.
Alias: `QUARTER`.
## toMonth {#tomonth}
Converts a date or date with time to a UInt8 number containing the month number (1-12).
Alias: `MONTH`.
## toDayOfYear {#todayofyear}
Converts a date or date with time to a UInt16 number containing the number of the day of the year (1-366).
Alias: `DAYOFYEAR`.
## toDayOfMonth {#todayofmonth}
Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31).
Aliases: `DAYOFMONTH`, `DAY`.
## toDayOfWeek {#todayofweek}
Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7).
Alias: `DAYOFWEEK`.
## toHour {#tohour}
Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23).
This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true; even in Moscow the clocks were twice changed at a different time).
Alias: `HOUR`.
## toMinute {#tominute}
Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59).
Alias: `MINUTE`.
## toSecond {#tosecond}
Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59).
Leap seconds are not accounted for.
Alias: `SECOND`.
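As in the English half of this page, a single query can sanity-check a few of these extractors at once (for `2021-02-26`, a Friday, day 57 of the year):

``` sql
SELECT
    toQuarter(dt)   AS quarter,      -- 1
    toDayOfYear(dt) AS day_of_year,  -- 57
    toDayOfWeek(dt) AS day_of_week   -- 5 (Friday)
FROM (SELECT toDateTime('2021-02-26 15:10:52') AS dt);
```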
## toUnixTimestamp {#to-unix-timestamp}
Converts a date with time to a UInt32 number -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time).

View File

@ -75,6 +75,8 @@ SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello;
Returns a string containing the argument's hexadecimal representation.
Alias: `HEX`.
**Syntax**
``` sql

View File

@ -13,6 +13,8 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u
isNull(x)
```
Alias: `ISNULL`.
**Parameters**
- `x` — A value with a non-compound data type.

View File

@ -9,10 +9,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u
Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.D (dot-separated numbers in decimal form).
Alias: `INET_NTOA`.
## IPv4StringToNum(s) {#ipv4stringtonums}
The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0.
Alias: `INET_ATON`.
## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum}
Similar to IPv4NumToString, but using xxx instead of the last octet.
@ -49,7 +53,11 @@ LIMIT 10
### IPv6NumToString(x) {#ipv6numtostringx}
Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format.
IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. Examples:
IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44.
Alias: `INET6_NTOA`.
Examples:
``` sql
SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr
@ -118,6 +126,8 @@ LIMIT 10
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
## IPv4ToIPv6(x) {#ipv4toipv6x}
Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:

View File

@ -95,6 +95,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b')
Repeats a string as many times as specified and concatenates the replicated values as a single string.
Alias: `REPEAT`.
**Syntax**
``` sql
@ -273,10 +275,14 @@ SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2)
Encodes the string s into its base64 representation.
Alias: `TO_BASE64`.
## base64Decode(s) {#base64decode}
Decodes the base64 representation s into the original string. Raises an exception if decoding fails.
Alias: `FROM_BASE64`.
## tryBase64Decode(s) {#trybase64decode}
Similar to base64Decode, but returns an empty string if decoding fails.
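A quick round trip using the case-insensitive aliases added in this change:

``` sql
SELECT
    TO_BASE64('test')       AS enc,  -- 'dGVzdA=='
    FROM_BASE64('dGVzdA==') AS dec;  -- 'test'
```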

View File

@ -36,10 +36,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u
**Example**
Query:
``` sql
SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8)
SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8);
```
Result:
``` text
┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐
│ -9223372036854775808 │ 32 │ 16 │ 8 │
@ -52,10 +56,14 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8)
**Example**
Query:
``` sql
select toInt64OrZero('123123'), toInt8OrZero('123qwe123')
SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123');
```
Result:
``` text
┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐
│ 123123 │ 0 │
@ -68,10 +76,14 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123')
**Example**
Query:
``` sql
select toInt64OrNull('123123'), toInt8OrNull('123qwe123')
SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123');
```
Result:
``` text
┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐
│ 123123 │ ᴺᵁᴸᴸ │
@ -102,10 +114,14 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123')
**Example**
Query:
``` sql
SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8)
SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8);
```
Result:
``` text
┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐
│ 9223372036854775808 │ 4294967264 │ 16 │ 8 │
@ -124,6 +140,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8)
## toDate {#todate}
Alias: `DATE`.
## toDateOrZero {#todateorzero}
## toDateOrNull {#todateornull}
@ -168,20 +186,28 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8)
**Examples**
Query:
``` sql
SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val)
SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val);
```
Result:
``` text
┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐
│ -1.11100 │ Nullable(Decimal(9, 5)) │
└──────────┴────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val)
SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val);
```
Result:
``` text
┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐
│ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │
@ -213,20 +239,28 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val)
**Example**
Query:
``` sql
SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val)
SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val);
```
Result:
``` text
┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐
│ -1.11100 │ Decimal(9, 5) │
└──────────┴────────────────────────────────────────────────────┘
```
Query:
``` sql
SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val)
SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val);
```
Result:
``` text
┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐
│ 0.00 │ Decimal(9, 2) │
@ -258,12 +292,18 @@ YYYY-MM-DD hh:mm:ss
Additionally, the toString function of a DateTime argument can take a second String argument with the name of the time zone. Example: `Asia/Yekaterinburg`. In this case, the time is formatted according to the specified time zone.
**Example**
Query:
``` sql
SELECT
now() AS now_local,
toString(now(), 'Asia/Yekaterinburg') AS now_yekat
toString(now(), 'Asia/Yekaterinburg') AS now_yekat;
```
Result:
``` text
┌───────────now_local─┬─now_yekat───────────┐
│ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │
@ -281,22 +321,30 @@ SELECT
Accepts a String or FixedString argument. Returns the String with the content truncated at the first zero byte found.
Example:
**Examples**
Query:
``` sql
SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut
SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut;
```
Result:
``` text
┌─s─────────────┬─s_cut─┐
│ foo\0\0\0\0\0 │ foo │
└───────────────┴───────┘
```
Query:
``` sql
SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut
SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut;
```
Result:
``` text
┌─s──────────┬─s_cut─┐
│ foo\0bar\0 │ foo │
@ -344,7 +392,7 @@ reinterpretAsUUID(fixed_string)
Query:
``` sql
SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f')))
SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f')));
```
Result:
@ -377,10 +425,15 @@ SELECT uuid = uuid2;
## CAST(x, T) {#type_conversion_function-cast}
Converts x to the t data type.
The syntax CAST(x AS t) is also supported.
Converts the input value `x` to the specified data type `T`.
Example:
The syntax `CAST(x AS t)` is also supported.
Note that if the value `x` cannot be converted to type `T`, an overflow occurs. For example, `CAST(-1, 'UInt8')` returns 255.
**Example**
Query:
``` sql
SELECT
@ -388,9 +441,11 @@ SELECT
CAST(timestamp AS DateTime) AS datetime,
CAST(timestamp AS Date) AS date,
CAST(timestamp, 'String') AS string,
CAST(timestamp, 'FixedString(22)') AS fixed_string
CAST(timestamp, 'FixedString(22)') AS fixed_string;
```
Result:
``` text
┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐
│ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │
@ -399,12 +454,18 @@ SELECT
Conversion to FixedString(N) only works for arguments of type String or FixedString(N).
Type conversion to [Nullable](../../sql-reference/functions/type-conversion-functions.md) and back is supported. Example:
Type conversion to [Nullable](../../sql-reference/functions/type-conversion-functions.md) and back is supported.
**Examples**
Query:
``` sql
SELECT toTypeName(x) FROM t_null
SELECT toTypeName(x) FROM t_null;
```
Result:
``` text
┌─toTypeName(x)─┐
│ Int8 │
@ -412,10 +473,14 @@ SELECT toTypeName(x) FROM t_null
└───────────────┘
```
Query:
``` sql
SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null
SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null;
```
Result:
``` text
┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐
│ Nullable(UInt16) │
@ -427,6 +492,93 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null
- Setting [cast_keep_nullable](../../operations/settings/settings.md#cast_keep_nullable)
## accurateCast(x, T) {#type_conversion_function-accurate-cast}
Converts the input value `x` to the specified data type `T`.
Unlike the [cast(x, T)](#type_conversion_function-cast) function, `accurateCast` does not allow overflow when converting numeric types. For example, `accurateCast(-1, 'UInt8')` throws an exception.
**Examples**
Query:
``` sql
SELECT cast(-1, 'UInt8') as uint8;
```
Result:
``` text
┌─uint8─┐
│ 255 │
└───────┘
```
Query:
```sql
SELECT accurateCast(-1, 'UInt8') as uint8;
```
Result:
``` text
Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8.
```
## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null}
Converts the input value `x` to the specified data type `T`.
Always returns a [Nullable](../../sql-reference/data-types/nullable.md) type; returns [NULL](../../sql-reference/syntax.md#null-literal) if the source value cannot be converted to the target type.
**Syntax**
```sql
accurateCastOrNull(x, T)
```
**Parameters**
- `x` — Input value.
- `T` — The name of the returned data type.
**Returned value**
- The value, converted to the specified data type `T`.
**Examples**
Query:
``` sql
SELECT toTypeName(accurateCastOrNull(5, 'UInt8'));
```
Result:
``` text
┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐
│ Nullable(UInt8) │
└────────────────────────────────────────────┘
```
Query:
``` sql
SELECT
accurateCastOrNull(-1, 'UInt8') as uint8,
accurateCastOrNull(128, 'Int8') as int8,
accurateCastOrNull('Test', 'FixedString(2)') as fixed_string;
```
Result:
``` text
┌─uint8─┬─int8─┬─fixed_string─┐
│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │
└───────┴──────┴──────────────┘
```
## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval}
Converts a Number type argument to the [IntervalType](../../sql-reference/data-types/special-data-types/interval.md) data type.
@ -454,6 +606,8 @@ toIntervalYear(number)
**Example**
Query:
``` sql
WITH
toDate('2019-01-01') AS date,
@ -461,9 +615,11 @@ WITH
toIntervalWeek(1) AS interval_to_week
SELECT
date + interval_week,
date + interval_to_week
date + interval_to_week;
```
Result:
``` text
┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐
│ 2019-01-08 │ 2019-01-08 │
@ -479,7 +635,7 @@ SELECT
**Syntax**
``` sql
parseDateTimeBestEffort(time_string[, time_zone]);
parseDateTimeBestEffort(time_string[, time_zone])
```
**Parameters**
@ -522,7 +678,7 @@ AS parseDateTimeBestEffort;
``` sql
SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow')
AS parseDateTimeBestEffort
AS parseDateTimeBestEffort;
```
Result:
@ -537,7 +693,7 @@ AS parseDateTimeBestEffort
``` sql
SELECT parseDateTimeBestEffort('1284101485')
AS parseDateTimeBestEffort
AS parseDateTimeBestEffort;
```
Result:
@ -552,7 +708,7 @@ AS parseDateTimeBestEffort
``` sql
SELECT parseDateTimeBestEffort('2018-12-12 10:12:12')
AS parseDateTimeBestEffort
AS parseDateTimeBestEffort;
```
Result:
@ -566,7 +722,7 @@ AS parseDateTimeBestEffort
Query:
``` sql
SELECT parseDateTimeBestEffort('10 20:19')
SELECT parseDateTimeBestEffort('10 20:19');
```
Result:
@ -591,7 +747,7 @@ SELECT parseDateTimeBestEffort('10 20:19')
**Syntax**
``` sql
parseDateTimeBestEffortUS(time_string [, time_zone]);
parseDateTimeBestEffortUS(time_string [, time_zone])
```
**Parameters**
@ -620,7 +776,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57')
AS parseDateTimeBestEffortUS;
```
Answer:
Result:
``` text
┌─parseDateTimeBestEffortUS─┐
@ -635,7 +791,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57')
AS parseDateTimeBestEffortUS;
```
Answer:
Result:
``` text
┌─parseDateTimeBestEffortUS─┐
@ -650,7 +806,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57')
AS parseDateTimeBestEffortUS;
```
Answer:
Result:
``` text
┌─parseDateTimeBestEffortUS─┐
@ -658,6 +814,178 @@ AS parseDateTimeBestEffortUS;
└───────────────────────────┘
```
## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull}
Works like the [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function, but returns `NULL` when the input string cannot be converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type.
**Syntax**
``` sql
parseDateTimeBestEffortUSOrNull(time_string[, time_zone])
```
**Parameters**
- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md).
- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the given timezone. Optional. [String](../../sql-reference/data-types/string.md).
**Supported non-standard formats**
- A string containing a 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time).
- A string with date and time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc.
- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc.
- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`.
- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`.
**Returned values**
- `time_string` converted to the `DateTime` data type.
- `NULL` if the input string cannot be converted to the `DateTime` data type.
**Examples**
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ 2021-02-10 21:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ 2021-02-11 00:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ 2021-02-10 00:00:00 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrNull─┐
│ ᴺᵁᴸᴸ │
└─────────────────────────────────┘
```
## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero}
Works like the [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function, but returns a zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when the input string cannot be converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type.
**Syntax**
``` sql
parseDateTimeBestEffortUSOrZero(time_string[, time_zone])
```
**Parameters**
- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md).
- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the given timezone. Optional. [String](../../sql-reference/data-types/string.md).
**Supported non-standard formats**
- A string containing a 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time).
- A string with date and time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc.
- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc.
- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`.
- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`.
**Returned values**
- `time_string` converted to the `DateTime` data type.
- Zero date or zero date with time if the input string cannot be converted to the `DateTime` data type.
**Examples**
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 2021-02-10 21:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 2021-02-11 00:12:57 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 2021-02-10 00:00:00 │
└─────────────────────────────────┘
```
Query:
``` sql
SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero;
```
Result:
``` text
┌─parseDateTimeBestEffortUSOrZero─┐
│ 1970-01-01 00:00:00 │
└─────────────────────────────────┘
```
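The two variants differ only in how failure is reported. Feeding the same unparsable string to both (a hypothetical input; the outputs follow directly from the definitions above):
``` sql
SELECT
    parseDateTimeBestEffortUSOrNull('not a date') AS or_null,  -- NULL
    parseDateTimeBestEffortUSOrZero('not a date') AS or_zero;  -- 1970-01-01 00:00:00
```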
## toUnixTimestamp64Milli
## toUnixTimestamp64Micro
## toUnixTimestamp64Nano
@ -685,10 +1013,10 @@ toUnixTimestamp64Milli(value)
``` sql
WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64
SELECT toUnixTimestamp64Milli(dt64)
SELECT toUnixTimestamp64Milli(dt64);
```
Answer:
Result:
``` text
┌─toUnixTimestamp64Milli(dt64)─┐
@ -700,10 +1028,10 @@ SELECT toUnixTimestamp64Milli(dt64)
``` sql
WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64
SELECT toUnixTimestamp64Nano(dt64)
SELECT toUnixTimestamp64Nano(dt64);
```
Answer:
Result:
``` text
┌─toUnixTimestamp64Nano(dt64)─┐
@ -738,10 +1066,10 @@ fromUnixTimestamp64Milli(value [, ti])
``` sql
WITH CAST(1234567891011, 'Int64') AS i64
SELECT fromUnixTimestamp64Milli(i64, 'UTC')
SELECT fromUnixTimestamp64Milli(i64, 'UTC');
```
Answer:
Result:
``` text
┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐
@ -772,12 +1100,12 @@ toLowCardinality(expr)
Type: `LowCardinality(expr_result_type)`
**Example**
**Example**
Query:
```sql
SELECT toLowCardinality('1')
SELECT toLowCardinality('1');
```
Result:
@ -813,10 +1141,10 @@ formatRow(format, x, y, ...)
``` sql
SELECT formatRow('CSV', number, 'good')
FROM numbers(3)
FROM numbers(3);
```
Answer:
Result:
``` text
┌─formatRow('CSV', number, 'good')─┐
@ -854,10 +1182,10 @@ formatRowNoNewline(format, x, y, ...)
``` sql
SELECT formatRowNoNewline('CSV', number, 'good')
FROM numbers(3)
FROM numbers(3);
```
Answer:
Result:
``` text
┌─formatRowNoNewline('CSV', number, 'good')─┐

View File

@ -13,10 +13,28 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ...
If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index to process the query.
Don't explicitly list too many values (millions). If a set is large, it is better to load it into a temporary table (for example, see the section «External data for query processing») and then use a subquery.
Don't explicitly list too many values (millions). If a set is large, it is better to load it into a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)) and then use a subquery.
The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a table or a SELECT subquery in parentheses.
If the data types on the left and right sides of the `IN` subquery differ, ClickHouse converts the left-side value to the data type of the right side. The conversion works as for the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function, i.e. the data type becomes [Nullable](../../sql-reference/data-types/nullable.md), and if the conversion cannot be performed, [NULL](../../sql-reference/syntax.md#null-literal) is returned.
**Example**
Query:
``` sql
SELECT '1' IN (SELECT 1);
```
Result:
``` text
┌─in('1', _subquery49)─┐
│ 1 │
└──────────────────────┘
```
If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. This is used when working with external data sent along with the query. For example, a set of visitor IDs loaded into the temporary table `users` can be sent along with the query, and filtering is performed over this set.
If the right side of the operator is the name of a table with the Set engine (a prepared set that is always kept in RAM), the set will not be re-created for each query.
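A minimal sketch of the Set-engine case (the `hits` table and the set contents are hypothetical):
``` sql
-- The set is built as rows are inserted and kept in RAM,
-- so it is not re-created for each query that uses it.
CREATE TABLE userid_set (UserID UInt64) ENGINE = Set;
INSERT INTO userid_set VALUES (123), (456);

SELECT count() FROM hits WHERE UserID IN userid_set;
```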

View File

@ -56,7 +56,7 @@ class ReservoirSamplerDeterministic
{
bool good(const UInt32 hash)
{
return hash == ((hash >> skip_degree) << skip_degree);
return !(hash & skip_mask);
}
public:
@ -135,11 +135,8 @@ public:
throw Poco::Exception("Cannot merge ReservoirSamplerDeterministic's with different max sample size");
sorted = false;
if (b.skip_degree > skip_degree)
{
skip_degree = b.skip_degree;
thinOut();
}
if (skip_degree < b.skip_degree)
setSkipDegree(b.skip_degree);
for (const auto & sample : b.samples)
if (good(sample.second))
@ -184,22 +181,39 @@ private:
size_t total_values = 0; /// How many values were inserted (regardless if they remain in sample or not).
bool sorted = false;
Array samples;
UInt8 skip_degree = 0; /// The number N determining that we save only one per 2^N elements in average.
/// The number N determining that we store only one per 2^N elements on average.
UInt8 skip_degree = 0;
/// skip_mask is calculated as (2 ^ skip_degree - 1). We store an element only if (hash & skip_mask) == 0.
/// For example, if skip_degree==0 then skip_mask==0 means we store each element;
/// if skip_degree==1 then skip_mask==0b0001 means we store one per 2 elements on average;
/// if skip_degree==4 then skip_mask==0b1111 means we store one per 16 elements on average.
UInt32 skip_mask = 0;
void insertImpl(const T & v, const UInt32 hash)
{
/// Make room for one more element.
while (samples.size() >= max_sample_size)
{
++skip_degree;
if (skip_degree > detail::MAX_SKIP_DEGREE)
throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED};
thinOut();
}
setSkipDegree(skip_degree + 1);
samples.emplace_back(v, hash);
}
void setSkipDegree(UInt8 skip_degree_)
{
if (skip_degree_ == skip_degree)
return;
if (skip_degree_ > detail::MAX_SKIP_DEGREE)
throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED};
skip_degree = skip_degree_;
if (skip_degree == detail::MAX_SKIP_DEGREE)
skip_mask = static_cast<UInt32>(-1);
else
skip_mask = (1 << skip_degree) - 1;
thinOut();
}
void thinOut()
{
samples.resize(std::distance(samples.begin(),

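The `good()` rewrite above replaces a shift round-trip with a mask test. A standalone sketch (not part of the patch) checking the equivalence for small degrees:
``` cpp
#include <cassert>
#include <cstdint>

// Keeping a hash iff its low skip_degree bits are zero (the mask test)
// is equivalent to checking that a right-then-left shift loses nothing.
int main()
{
    for (uint32_t skip_degree = 0; skip_degree < 8; ++skip_degree)
    {
        const uint32_t skip_mask = (1u << skip_degree) - 1;
        for (uint32_t hash = 0; hash < 4096; ++hash)
            assert((hash == ((hash >> skip_degree) << skip_degree)) == !(hash & skip_mask));
    }
}
```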
View File

@ -1,5 +1,7 @@
#pragma once
#include <common/defines.h>
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-id-macro"
@ -9,14 +11,15 @@
#define __msan_test_shadow(X, Y) (false)
#define __msan_print_shadow(X, Y)
#define __msan_unpoison_string(X)
#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
# undef __msan_unpoison
# undef __msan_test_shadow
# undef __msan_print_shadow
# undef __msan_unpoison_string
# include <sanitizer/msan_interface.h>
# endif
#if defined(ch_has_feature)
# if ch_has_feature(memory_sanitizer)
# undef __msan_unpoison
# undef __msan_test_shadow
# undef __msan_print_shadow
# undef __msan_unpoison_string
# include <sanitizer/msan_interface.h>
# endif
#endif
#ifdef __clang__

View File

@ -37,8 +37,12 @@ public:
static constexpr size_t capacity =
#ifndef NDEBUG
/* The stacks are normally larger in debug version due to less inlining. */
64
/* The stacks are normally larger in debug version due to less inlining.
*
* NOTE: it cannot be larger than 56 right now, since otherwise it will
* not fit into minimal PIPE_BUF (512) in TraceCollector.
*/
56
#else
32
#endif

View File

@ -60,11 +60,11 @@ Otherwise you will get only exported symbols from program headers.
#endif
#define __msan_unpoison_string(X) // NOLINT
#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
# undef __msan_unpoison_string
# include <sanitizer/msan_interface.h>
# endif
#if defined(ch_has_feature)
# if ch_has_feature(memory_sanitizer)
# undef __msan_unpoison_string
# include <sanitizer/msan_interface.h>
# endif
#endif

View File

@ -22,7 +22,9 @@ namespace
{
/// Normally query_id is a UUID (a string of fixed length), but the user can provide a custom query_id.
/// Thus an upper bound on query_id length should be introduced to avoid buffer overflow in the signal handler.
constexpr size_t QUERY_ID_MAX_LEN = 1024;
///
/// And it cannot be large, since otherwise it will not fit into PIPE_BUF.
constexpr size_t QUERY_ID_MAX_LEN = sizeof("00000000-0000-0000-0000-000000000000") - 1; // 36
}
LazyPipeFDs pipe;
@ -60,10 +62,14 @@ void TraceCollector::collect(TraceType trace_type, const StackTrace & stack_trac
8 * sizeof(char) + // maximum VarUInt length for string size
QUERY_ID_MAX_LEN * sizeof(char) + // maximum query_id length
sizeof(UInt8) + // number of stack frames
sizeof(StackTrace::Frames) + // collected stack trace, maximum capacity
sizeof(StackTrace::FramePointers) + // collected stack trace, maximum capacity
sizeof(TraceType) + // trace type
sizeof(UInt64) + // thread_id
sizeof(Int64); // size
/// Write should be atomic to avoid overlaps
/// (since recursive collect() is possible)
static_assert(buf_size < PIPE_BUF, "Only write of PIPE_BUF to pipe is atomic");
char buffer[buf_size];
WriteBufferFromFileDescriptorDiscardOnFailure out(pipe.fds_rw[1], buf_size, buffer);
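A back-of-the-envelope check of that static_assert, assuming a 64-bit build, the debug stack capacity of 56 from above, and, as an assumption, one leading flag byte in the message:
``` cpp
#include <cstddef>

constexpr size_t query_id_max_len = 36; // "00000000-0000-0000-0000-000000000000"
constexpr size_t frame_capacity = 56;   // debug-build StackTrace capacity (see above)
constexpr size_t posix_min_pipe_buf = 512;

constexpr size_t approx_buf_size = 1    // leading flag byte (assumption)
    + 8                                 // maximum VarUInt length for string size
    + query_id_max_len                  // query_id text
    + 1                                 // number of stack frames
    + frame_capacity * sizeof(void *)   // frame pointers: 56 * 8 = 448
    + 1                                 // trace type
    + 8                                 // thread_id
    + 8;                                // size

// 511 < 512: a single write() stays atomic even on the smallest POSIX PIPE_BUF;
// a 57th frame pointer would push the total to 519 and break the guarantee.
static_assert(approx_buf_size == 511 && approx_buf_size < posix_min_pipe_buf);
```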

View File

@ -0,0 +1,557 @@
#include <Coordination/Changelog.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromFile.h>
#include <filesystem>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <Common/Exception.h>
#include <Common/SipHash.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CHECKSUM_DOESNT_MATCH;
extern const int CORRUPTED_DATA;
extern const int UNKNOWN_FORMAT_VERSION;
extern const int LOGICAL_ERROR;
}
namespace
{
constexpr auto DEFAULT_PREFIX = "changelog";
std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name)
{
std::filesystem::path path(prefix);
path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_index) + "_" + std::to_string(name.to_log_index) + ".bin");
return path;
}
ChangelogFileDescription getChangelogFileDescription(const std::string & path_str)
{
std::filesystem::path path(path_str);
std::string filename = path.stem();
Strings filename_parts;
boost::split(filename_parts, filename, boost::is_any_of("_"));
if (filename_parts.size() < 3)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str);
ChangelogFileDescription result;
result.prefix = filename_parts[0];
result.from_log_index = parse<size_t>(filename_parts[1]);
result.to_log_index = parse<size_t>(filename_parts[2]);
result.path = path_str;
return result;
}
LogEntryPtr makeClone(const LogEntryPtr & entry)
{
return cs_new<nuraft::log_entry>(entry->get_term(), nuraft::buffer::clone(entry->get_buf()), entry->get_val_type());
}
Checksum computeRecordChecksum(const ChangelogRecord & record)
{
SipHash hash;
hash.update(record.header.version);
hash.update(record.header.index);
hash.update(record.header.term);
hash.update(record.header.value_type);
hash.update(record.header.blob_size);
if (record.header.blob_size != 0)
hash.update(reinterpret_cast<char *>(record.blob->data_begin()), record.blob->size());
return hash.get64();
}
}
class ChangelogWriter
{
public:
ChangelogWriter(const std::string & filepath_, WriteMode mode, size_t start_index_)
: filepath(filepath_)
, plain_buf(filepath, DBMS_DEFAULT_BUFFER_SIZE, mode == WriteMode::Rewrite ? -1 : (O_APPEND | O_CREAT | O_WRONLY))
, start_index(start_index_)
{}
off_t appendRecord(ChangelogRecord && record, bool sync)
{
off_t result = plain_buf.count();
writeIntBinary(computeRecordChecksum(record), plain_buf);
writeIntBinary(record.header.version, plain_buf);
writeIntBinary(record.header.index, plain_buf);
writeIntBinary(record.header.term, plain_buf);
writeIntBinary(record.header.value_type, plain_buf);
writeIntBinary(record.header.blob_size, plain_buf);
if (record.header.blob_size != 0)
plain_buf.write(reinterpret_cast<char *>(record.blob->data_begin()), record.blob->size());
entries_written++;
if (sync)
plain_buf.sync();
return result;
}
void truncateToLength(off_t new_length)
{
flush();
plain_buf.truncate(new_length);
plain_buf.seek(new_length, SEEK_SET);
}
void flush()
{
plain_buf.sync();
}
size_t getEntriesWritten() const
{
return entries_written;
}
void setEntriesWritten(size_t entries_written_)
{
entries_written = entries_written_;
}
size_t getStartIndex() const
{
return start_index;
}
void setStartIndex(size_t start_index_)
{
start_index = start_index_;
}
private:
std::string filepath;
WriteBufferFromFile plain_buf;
size_t entries_written = 0;
size_t start_index;
};
struct ChangelogReadResult
{
size_t entries_read;
off_t last_position;
bool error;
};
class ChangelogReader
{
public:
explicit ChangelogReader(const std::string & filepath_)
: filepath(filepath_)
, read_buf(filepath)
{}
ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_index, IndexToOffset & index_to_offset, Poco::Logger * log)
{
size_t previous_index = 0;
ChangelogReadResult result{};
try
{
while (!read_buf.eof())
{
result.last_position = read_buf.count();
Checksum record_checksum;
readIntBinary(record_checksum, read_buf);
/// Initialization is required, otherwise checksums may fail
ChangelogRecord record;
readIntBinary(record.header.version, read_buf);
readIntBinary(record.header.index, read_buf);
readIntBinary(record.header.term, read_buf);
readIntBinary(record.header.value_type, read_buf);
readIntBinary(record.header.blob_size, read_buf);
if (record.header.version > CURRENT_CHANGELOG_VERSION)
throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", record.header.version, filepath);
if (record.header.blob_size != 0)
{
auto buffer = nuraft::buffer::alloc(record.header.blob_size);
auto * buffer_begin = reinterpret_cast<char *>(buffer->data_begin());
read_buf.readStrict(buffer_begin, record.header.blob_size);
record.blob = buffer;
}
else
record.blob = nullptr;
if (previous_index != 0 && previous_index + 1 != record.header.index)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Previous log entry {}, next log entry {}: it seems like some entries were skipped", previous_index, record.header.index);
previous_index = record.header.index;
Checksum checksum = computeRecordChecksum(record);
if (checksum != record_checksum)
{
throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH,
"Checksum doesn't match for log {} (version {}), index {}, blob_size {}",
filepath, record.header.version, record.header.index, record.header.blob_size);
}
if (logs.count(record.header.index) != 0)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath);
result.entries_read += 1;
if (record.header.index < start_log_index)
continue;
auto log_entry = nuraft::cs_new<nuraft::log_entry>(record.header.term, record.blob, record.header.value_type);
logs.emplace(record.header.index, log_entry);
index_to_offset[record.header.index] = result.last_position;
if (result.entries_read % 50000 == 0)
LOG_TRACE(log, "Reading changelog from path {}, entries {}", filepath, result.entries_read);
}
}
catch (const Exception & ex)
{
if (ex.code() == ErrorCodes::UNKNOWN_FORMAT_VERSION)
throw ex;
result.error = true;
LOG_WARNING(log, "Cannot completely read changelog on path {}, error: {}", filepath, ex.message());
}
catch (...)
{
result.error = true;
tryLogCurrentException(log);
}
LOG_TRACE(log, "Finished reading changelog {}: {} entries total", filepath, result.entries_read);
return result;
}
private:
std::string filepath;
ReadBufferFromFile read_buf;
};
Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_, Poco::Logger * log_)
: changelogs_dir(changelogs_dir_)
, rotate_interval(rotate_interval_)
, log(log_)
{
namespace fs = std::filesystem;
if (!fs::exists(changelogs_dir))
fs::create_directories(changelogs_dir);
for (const auto & p : fs::directory_iterator(changelogs_dir))
{
auto file_description = getChangelogFileDescription(p.path());
existing_changelogs[file_description.from_log_index] = file_description;
}
}
void Changelog::readChangelogAndInitWriter(size_t from_log_index)
{
start_index = from_log_index == 0 ? 1 : from_log_index;
size_t total_read = 0;
size_t entries_in_last = 0;
size_t incomplete_log_index = 0;
ChangelogReadResult result{};
bool started = false;
for (const auto & [changelog_start_index, changelog_description] : existing_changelogs)
{
entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1;
if (changelog_description.to_log_index >= from_log_index)
{
if (!started)
{
if (changelog_description.from_log_index > start_index)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Cannot read changelog from index {}, smallest available index {}", start_index, changelog_description.from_log_index);
started = true;
}
ChangelogReader reader(changelog_description.path);
result = reader.readChangelog(logs, from_log_index, index_to_start_pos, log);
total_read += result.entries_read;
/// May happen after truncate, crash or simply unfinished log
if (result.entries_read < entries_in_last)
{
incomplete_log_index = changelog_start_index;
break;
}
}
}
if (!started && start_index != 1)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Required to read data from {}, but we don't have any active changelogs", from_log_index);
if (incomplete_log_index != 0)
{
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = existing_changelogs.upper_bound(incomplete_log_index); itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it goes after a broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
/// Continue to write into existing log
if (!existing_changelogs.empty())
{
auto description = existing_changelogs.rbegin()->second;
LOG_TRACE(log, "Continue to write into {}", description.path);
current_writer = std::make_unique<ChangelogWriter>(description.path, WriteMode::Append, description.from_log_index);
current_writer->setEntriesWritten(result.entries_read);
/// Truncate all broken entries from log
if (result.error)
{
LOG_WARNING(log, "Read finished with error, truncating all broken log entries");
current_writer->truncateToLength(result.last_position);
}
}
}
/// Start new log if we don't initialize writer from previous log
if (!current_writer)
rotate(start_index + total_read);
}
void Changelog::rotate(size_t new_start_log_index)
{
/// The writer doesn't exist on init.
if (current_writer)
current_writer->flush();
ChangelogFileDescription new_description;
new_description.prefix = DEFAULT_PREFIX;
new_description.from_log_index = new_start_log_index;
new_description.to_log_index = new_start_log_index + rotate_interval - 1;
new_description.path = formatChangelogPath(changelogs_dir, new_description);
LOG_TRACE(log, "Starting new changelog {}", new_description.path);
existing_changelogs[new_start_log_index] = new_description;
current_writer = std::make_unique<ChangelogWriter>(new_description.path, WriteMode::Rewrite, new_start_log_index);
}
ChangelogRecord Changelog::buildRecord(size_t index, const LogEntryPtr & log_entry)
{
ChangelogRecord record;
record.header.version = ChangelogVersion::V0;
record.header.index = index;
record.header.term = log_entry->get_term();
record.header.value_type = log_entry->get_val_type();
auto buffer = log_entry->get_buf_ptr();
if (buffer)
record.header.blob_size = buffer->size();
else
record.header.blob_size = 0;
record.blob = buffer;
return record;
}
void Changelog::appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync)
{
if (!current_writer)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records");
if (logs.empty())
start_index = index;
if (current_writer->getEntriesWritten() == rotate_interval)
rotate(index);
auto offset = current_writer->appendRecord(buildRecord(index, log_entry), force_sync);
if (!index_to_start_pos.try_emplace(index, offset).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index);
logs[index] = makeClone(log_entry);
}
void Changelog::writeAt(size_t index, const LogEntryPtr & log_entry, bool force_sync)
{
if (index_to_start_pos.count(index) == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index);
bool go_to_previous_file = index < current_writer->getStartIndex();
if (go_to_previous_file)
{
auto index_changelog = existing_changelogs.lower_bound(index);
ChangelogFileDescription description;
if (index_changelog->first == index)
description = index_changelog->second;
else
description = std::prev(index_changelog)->second;
current_writer = std::make_unique<ChangelogWriter>(description.path, WriteMode::Append, index_changelog->first);
current_writer->setEntriesWritten(description.to_log_index - description.from_log_index + 1);
}
auto entries_written = current_writer->getEntriesWritten();
current_writer->truncateToLength(index_to_start_pos[index]);
if (go_to_previous_file)
{
/// Remove all subsequent files
auto to_remove_itr = existing_changelogs.upper_bound(index);
for (auto itr = to_remove_itr; itr != existing_changelogs.end();)
{
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
}
/// Remove redundant logs from memory
for (size_t i = index; ; ++i)
{
auto log_itr = logs.find(i);
if (log_itr == logs.end())
break;
logs.erase(log_itr);
index_to_start_pos.erase(i);
entries_written--;
}
current_writer->setEntriesWritten(entries_written);
appendEntry(index, log_entry, force_sync);
}
void Changelog::compact(size_t up_to_log_index)
{
for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();)
{
/// Remove all completely outdated changelog files
if (itr->second.to_log_index <= up_to_log_index)
{
LOG_INFO(log, "Removing changelog {} because of compaction", itr->second.path);
std::erase_if(index_to_start_pos, [right_index = itr->second.to_log_index] (const auto & item) { return item.first <= right_index; });
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
else /// Files are ordered, so all subsequent should exist
break;
}
start_index = up_to_log_index + 1;
std::erase_if(logs, [up_to_log_index] (const auto & item) { return item.first <= up_to_log_index; });
}
LogEntryPtr Changelog::getLastEntry() const
{
static LogEntryPtr fake_entry = nuraft::cs_new<nuraft::log_entry>(0, nuraft::buffer::alloc(sizeof(size_t)));
size_t next_index = getNextEntryIndex() - 1;
auto entry = logs.find(next_index);
if (entry == logs.end())
return fake_entry;
return entry->second;
}
LogEntriesPtr Changelog::getLogEntriesBetween(size_t start, size_t end)
{
LogEntriesPtr ret = nuraft::cs_new<std::vector<nuraft::ptr<nuraft::log_entry>>>();
ret->resize(end - start);
size_t result_pos = 0;
for (size_t i = start; i < end; ++i)
{
(*ret)[result_pos] = entryAt(i);
result_pos++;
}
return ret;
}
LogEntryPtr Changelog::entryAt(size_t index)
{
nuraft::ptr<nuraft::log_entry> src = nullptr;
auto entry = logs.find(index);
if (entry == logs.end())
return nullptr;
src = entry->second;
return src;
}
nuraft::ptr<nuraft::buffer> Changelog::serializeEntriesToBuffer(size_t index, int32_t count)
{
std::vector<nuraft::ptr<nuraft::buffer>> returned_logs;
size_t size_total = 0;
for (size_t i = index; i < index + count; ++i)
{
auto entry = logs.find(i);
if (entry == logs.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Don't have log entry {}", i);
nuraft::ptr<nuraft::buffer> buf = entry->second->serialize();
size_total += buf->size();
returned_logs.push_back(buf);
}
nuraft::ptr<nuraft::buffer> buf_out = nuraft::buffer::alloc(sizeof(int32_t) + count * sizeof(int32_t) + size_total);
buf_out->pos(0);
buf_out->put(static_cast<int32_t>(count));
for (auto & entry : returned_logs)
{
nuraft::ptr<nuraft::buffer> & bb = entry;
buf_out->put(static_cast<int32_t>(bb->size()));
buf_out->put(*bb);
}
return buf_out;
}
void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync)
{
buffer.pos(0);
int num_logs = buffer.get_int();
for (int i = 0; i < num_logs; ++i)
{
size_t cur_index = index + i;
int buf_size = buffer.get_int();
nuraft::ptr<nuraft::buffer> buf_local = nuraft::buffer::alloc(buf_size);
buffer.get(buf_local);
LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local);
if (i == 0 && logs.count(cur_index))
writeAt(cur_index, log_entry, force_sync);
else
appendEntry(cur_index, log_entry, force_sync);
}
}
void Changelog::flush()
{
current_writer->flush();
}
Changelog::~Changelog()
{
try
{
if (current_writer)
current_writer->flush();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}

View File

@ -0,0 +1,136 @@
#pragma once
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <city.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/HashingWriteBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
#include <Disks/IDisk.h>
namespace DB
{
using Checksum = UInt64;
using LogEntryPtr = nuraft::ptr<nuraft::log_entry>;
using LogEntries = std::vector<LogEntryPtr>;
using LogEntriesPtr = nuraft::ptr<LogEntries>;
using BufferPtr = nuraft::ptr<nuraft::buffer>;
using IndexToOffset = std::unordered_map<size_t, off_t>;
using IndexToLogEntry = std::unordered_map<size_t, LogEntryPtr>;
enum class ChangelogVersion : uint8_t
{
V0 = 0,
};
static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0;
struct ChangelogRecordHeader
{
ChangelogVersion version = CURRENT_CHANGELOG_VERSION;
size_t index; /// entry log number
size_t term;
nuraft::log_val_type value_type;
size_t blob_size;
};
/// Changelog record on disk
struct ChangelogRecord
{
ChangelogRecordHeader header;
nuraft::ptr<nuraft::buffer> blob;
};
/// changelog_fromindex_toindex.bin
/// [fromindex, toindex] <- inclusive
struct ChangelogFileDescription
{
std::string prefix;
size_t from_log_index;
size_t to_log_index;
std::string path;
};
class ChangelogWriter;
/// Simplest changelog with files rotation.
/// No compression, no metadata, just entries with headers one by one
/// Able to read broken files/entries and discard them.
class Changelog
{
public:
Changelog(const std::string & changelogs_dir_, size_t rotate_interval_, Poco::Logger * log_);
/// Read changelog from files on changelogs_dir_ skipping all entries before from_log_index
/// Truncate broken entries, remove files after broken entries.
void readChangelogAndInitWriter(size_t from_log_index);
/// Add entry to log with index. Call fsync if force_sync is true.
void appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync);
/// Write entry at index and truncate all subsequent entries.
void writeAt(size_t index, const LogEntryPtr & log_entry, bool force_sync);
/// Remove log files with to_log_index <= up_to_log_index.
void compact(size_t up_to_log_index);
size_t getNextEntryIndex() const
{
return start_index + logs.size();
}
size_t getStartIndex() const
{
return start_index;
}
/// Last entry in log, or fake entry with term 0 if log is empty
LogEntryPtr getLastEntry() const;
/// Return log entries between [start, end)
LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_index);
/// Return entry at position index
LogEntryPtr entryAt(size_t index);
/// Serialize entries from index into buffer
BufferPtr serializeEntriesToBuffer(size_t index, int32_t count);
/// Apply entries from buffer overriding existing entries
void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync);
/// Fsync log to disk
void flush();
size_t size() const
{
return logs.size();
}
/// Fsync log to disk
~Changelog();
private:
/// Pack log_entry into changelog record
static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry);
/// Starts new file [new_start_log_index, new_start_log_index + rotate_interval]
void rotate(size_t new_start_log_index);
private:
const std::string changelogs_dir;
const size_t rotate_interval;
Poco::Logger * log;
std::map<size_t, ChangelogFileDescription> existing_changelogs;
std::unique_ptr<ChangelogWriter> current_writer;
IndexToOffset index_to_start_pos;
IndexToLogEntry logs;
size_t start_index = 0;
};
}
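A minimal usage sketch of this API (hypothetical directory, rotation interval, and values; error handling omitted):
``` cpp
#include <Coordination/Changelog.h>
#include <Poco/Logger.h>

void changelogSmoke()
{
    auto * log = &Poco::Logger::get("ChangelogSmoke");

    /// Files named like ./changelogs/changelog_1_100.bin, 100 entries per file.
    DB::Changelog changelog("./changelogs", /* rotate_interval */ 100, log);
    changelog.readChangelogAndInitWriter(/* from_log_index */ 1);

    auto entry = nuraft::cs_new<nuraft::log_entry>(/* term */ 1, nuraft::buffer::alloc(8));
    changelog.appendEntry(changelog.getNextEntryIndex(), entry, /* force_sync */ true);

    /// After a snapshot covering indices <= 50, the old files can go.
    changelog.compact(50);
    changelog.flush();
}
```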

View File

@ -22,13 +22,15 @@ struct Settings;
M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \
M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too frequent leader elections)", 0) \
M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Upper bound of election timer (avoid too frequent leader elections)", 0) \
M(UInt64, reserved_log_items, 5000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 5000, "How many log items we have to collect to write new snapshot", 0) \
M(UInt64, reserved_log_items, 50000, "How many log items to store (don't remove during compaction)", 0) \
M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \
M(UInt64, max_stored_snapshots, 3, "How many snapshots we want to store", 0) \
M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \
M(Milliseconds, shutdown_timeout, 5000, "How long we will wait until RAFT shuts down", 0) \
M(Milliseconds, startup_timeout, 30000, "How long we will wait until RAFT starts", 0) \
M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0)
M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \
M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \
M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -72,12 +72,12 @@ nuraft::ptr<std::vector<nuraft::ptr<nuraft::log_entry>>> InMemoryLogStore::log_e
ret->resize(end - start);
size_t cc = 0;
for (size_t ii = start; ii < end; ++ii)
for (size_t i = start; i < end; ++i)
{
nuraft::ptr<nuraft::log_entry> src = nullptr;
{
std::lock_guard<std::mutex> l(logs_lock);
auto entry = logs.find(ii);
auto entry = logs.find(i);
if (entry == logs.end())
{
entry = logs.find(0);
@ -152,9 +152,9 @@ void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack)
pack.pos(0);
Int32 num_logs = pack.get_int();
for (Int32 ii = 0; ii < num_logs; ++ii)
for (Int32 i = 0; i < num_logs; ++i)
{
size_t cur_idx = index + ii;
size_t cur_idx = index + i;
Int32 buf_size = pack.get_int();
nuraft::ptr<nuraft::buffer> buf_local = nuraft::buffer::alloc(buf_size);

View File

@ -9,12 +9,26 @@ namespace DB
class LoggerWrapper : public nuraft::logger
{
private:
static inline const std::unordered_map<LogsLevel, Poco::Message::Priority> LEVELS =
{
{LogsLevel::trace, Poco::Message::Priority::PRIO_TRACE},
{LogsLevel::debug, Poco::Message::Priority::PRIO_DEBUG},
{LogsLevel::information, Poco::Message::PRIO_INFORMATION},
{LogsLevel::warning, Poco::Message::PRIO_WARNING},
{LogsLevel::error, Poco::Message::PRIO_ERROR},
{LogsLevel::fatal, Poco::Message::PRIO_FATAL}
};
static inline const int LEVEL_MAX = static_cast<int>(LogsLevel::trace);
static inline const int LEVEL_MIN = static_cast<int>(LogsLevel::none);
public:
LoggerWrapper(const std::string & name, LogsLevel level_)
: log(&Poco::Logger::get(name))
, level(static_cast<int>(level_))
, level(level_)
{
log->setLevel(level);
log->setLevel(static_cast<int>(LEVELS.at(level)));
}
void put_details(
@ -24,24 +38,26 @@ public:
size_t /* line_number */,
const std::string & msg) override
{
LOG_IMPL(log, static_cast<DB::LogsLevel>(level_), static_cast<Poco::Message::Priority>(level_), msg);
LogsLevel db_level = static_cast<LogsLevel>(level_);
LOG_IMPL(log, db_level, LEVELS.at(db_level), msg);
}
void set_level(int level_) override
{
level_ = std::min(6, std::max(1, level_));
log->setLevel(level_);
level = level_;
level_ = std::min(LEVEL_MAX, std::max(LEVEL_MIN, level_));
level = static_cast<LogsLevel>(level_);
log->setLevel(static_cast<int>(LEVELS.at(level)));
}
int get_level() override
{
return level;
LogsLevel lvl = level;
return static_cast<int>(lvl);
}
private:
Poco::Logger * log;
std::atomic<int> level;
std::atomic<LogsLevel> level;
};
}

View File

@ -0,0 +1,105 @@
#include <Coordination/NuKeeperLogStore.h>
namespace DB
{
NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_)
: log(&Poco::Logger::get("NuKeeperLogStore"))
, changelog(changelogs_path, rotate_interval_, log)
, force_sync(force_sync_)
{
}
size_t NuKeeperLogStore::start_index() const
{
std::lock_guard lock(changelog_lock);
return changelog.getStartIndex();
}
void NuKeeperLogStore::init(size_t from_log_idx)
{
std::lock_guard lock(changelog_lock);
changelog.readChangelogAndInitWriter(from_log_idx);
}
size_t NuKeeperLogStore::next_slot() const
{
std::lock_guard lock(changelog_lock);
return changelog.getNextEntryIndex();
}
nuraft::ptr<nuraft::log_entry> NuKeeperLogStore::last_entry() const
{
std::lock_guard lock(changelog_lock);
return changelog.getLastEntry();
}
size_t NuKeeperLogStore::append(nuraft::ptr<nuraft::log_entry> & entry)
{
std::lock_guard lock(changelog_lock);
size_t idx = changelog.getNextEntryIndex();
changelog.appendEntry(idx, entry, force_sync);
return idx;
}
void NuKeeperLogStore::write_at(size_t index, nuraft::ptr<nuraft::log_entry> & entry)
{
std::lock_guard lock(changelog_lock);
changelog.writeAt(index, entry, force_sync);
}
nuraft::ptr<std::vector<nuraft::ptr<nuraft::log_entry>>> NuKeeperLogStore::log_entries(size_t start, size_t end)
{
std::lock_guard lock(changelog_lock);
return changelog.getLogEntriesBetween(start, end);
}
nuraft::ptr<nuraft::log_entry> NuKeeperLogStore::entry_at(size_t index)
{
std::lock_guard lock(changelog_lock);
return changelog.entryAt(index);
}
size_t NuKeeperLogStore::term_at(size_t index)
{
std::lock_guard lock(changelog_lock);
auto entry = changelog.entryAt(index);
if (entry)
return entry->get_term();
return 0;
}
nuraft::ptr<nuraft::buffer> NuKeeperLogStore::pack(size_t index, int32_t cnt)
{
std::lock_guard lock(changelog_lock);
return changelog.serializeEntriesToBuffer(index, cnt);
}
bool NuKeeperLogStore::compact(size_t last_log_index)
{
std::lock_guard lock(changelog_lock);
changelog.compact(last_log_index);
return true;
}
bool NuKeeperLogStore::flush()
{
std::lock_guard lock(changelog_lock);
changelog.flush();
return true;
}
void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack)
{
std::lock_guard lock(changelog_lock);
changelog.applyEntriesFromBuffer(index, pack, force_sync);
}
size_t NuKeeperLogStore::size() const
{
std::lock_guard lock(changelog_lock);
return changelog.size();
}
}

View File

@ -0,0 +1,52 @@
#pragma once
#include <libnuraft/log_store.hxx> // Y_IGNORE
#include <map>
#include <mutex>
#include <Core/Types.h>
#include <Coordination/Changelog.h>
#include <common/logger_useful.h>
namespace DB
{
class NuKeeperLogStore : public nuraft::log_store
{
public:
NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_);
void init(size_t from_log_idx);
size_t start_index() const override;
size_t next_slot() const override;
nuraft::ptr<nuraft::log_entry> last_entry() const override;
size_t append(nuraft::ptr<nuraft::log_entry> & entry) override;
void write_at(size_t index, nuraft::ptr<nuraft::log_entry> & entry) override;
nuraft::ptr<std::vector<nuraft::ptr<nuraft::log_entry>>> log_entries(size_t start, size_t end) override;
nuraft::ptr<nuraft::log_entry> entry_at(size_t index) override;
size_t term_at(size_t index) override;
nuraft::ptr<nuraft::buffer> pack(size_t index, int32_t cnt) override;
void apply_pack(size_t index, nuraft::buffer & pack) override;
bool compact(size_t last_log_index) override;
bool flush() override;
size_t size() const;
private:
mutable std::mutex changelog_lock;
Poco::Logger * log;
Changelog changelog;
bool force_sync;
};
}
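A similar sketch for driving the store through the nuraft::log_store interface (hypothetical values; in the server the state manager owns the store):
``` cpp
#include <Coordination/NuKeeperLogStore.h>

void logStoreSmoke()
{
    /// Rotate every 100000 entries, fsync each append (hypothetical values).
    DB::NuKeeperLogStore store("./raft_logs", /* rotate_interval_ */ 100000, /* force_sync_ */ true);
    store.init(/* from_log_idx */ 1);

    auto entry = nuraft::cs_new<nuraft::log_entry>(/* term */ 1, nuraft::buffer::alloc(8));
    size_t idx = store.append(entry);   /// every call is guarded by changelog_lock
    store.write_at(idx, entry);         /// truncates everything from idx, then appends
    store.flush();
}
```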

View File

@ -1,7 +1,7 @@
#include <Coordination/NuKeeperServer.h>
#include <Coordination/LoggerWrapper.h>
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/NuKeeperStateManager.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
@ -26,13 +26,16 @@ NuKeeperServer::NuKeeperServer(
: server_id(server_id_)
, coordination_settings(coordination_settings_)
, state_machine(nuraft::cs_new<NuKeeperStateMachine>(responses_queue_, coordination_settings))
, state_manager(nuraft::cs_new<InMemoryStateManager>(server_id, "test_keeper_server.raft_configuration", config))
, state_manager(nuraft::cs_new<NuKeeperStateManager>(server_id, "test_keeper_server", config, coordination_settings))
, responses_queue(responses_queue_)
{
}
void NuKeeperServer::startup()
{
state_manager->loadLogStore(state_machine->last_commit_index());
nuraft::raft_params params;
params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds();
params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds();
@ -64,6 +67,7 @@ void NuKeeperServer::startup()
void NuKeeperServer::shutdown()
{
state_machine->shutdownStorage();
state_manager->flushLogStore();
if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds()))
LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5);
}
@ -157,7 +161,7 @@ bool NuKeeperServer::isLeaderAlive() const
nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */)
{
if (type == nuraft::cb_func::Type::BecomeFresh || type == nuraft::cb_func::Type::BecomeLeader)
if ((type == nuraft::cb_func::InitialBatchCommited && isLeader()) || type == nuraft::cb_func::BecomeFresh)
{
std::unique_lock lock(initialized_mutex);
initialized_flag = true;

View File

@ -2,7 +2,7 @@
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <Coordination/InMemoryLogStore.h>
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/NuKeeperStateManager.h>
#include <Coordination/NuKeeperStateMachine.h>
#include <Coordination/NuKeeperStorage.h>
#include <Coordination/CoordinationSettings.h>
@ -20,7 +20,7 @@ private:
nuraft::ptr<NuKeeperStateMachine> state_machine;
nuraft::ptr<InMemoryStateManager> state_manager;
nuraft::ptr<NuKeeperStateManager> state_manager;
nuraft::raft_launcher launcher;

View File

@ -46,7 +46,7 @@ NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, co
, storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, last_committed_idx(0)
, log(&Poco::Logger::get("NuRaftStateMachine"))
, log(&Poco::Logger::get("NuKeeperStateMachine"))
{
LOG_DEBUG(log, "Created nukeeper state machine");
}

View File

@ -1,4 +1,4 @@
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/NuKeeperStateManager.h>
#include <Common/Exception.h>
namespace DB
@ -9,30 +9,34 @@ namespace ErrorCodes
extern const int RAFT_ERROR;
}
InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port)
NuKeeperStateManager::NuKeeperStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path)
: my_server_id(server_id_)
, my_port(port)
, log_store(nuraft::cs_new<InMemoryLogStore>())
, log_store(nuraft::cs_new<NuKeeperLogStore>(logs_path, 5000, false))
, cluster_config(nuraft::cs_new<nuraft::cluster_config>())
{
auto peer_config = nuraft::cs_new<nuraft::srv_config>(my_server_id, host + ":" + std::to_string(port));
cluster_config->get_servers().push_back(peer_config);
}
InMemoryStateManager::InMemoryStateManager(
NuKeeperStateManager::NuKeeperStateManager(
int my_server_id_,
const std::string & config_prefix,
const Poco::Util::AbstractConfiguration & config)
const Poco::Util::AbstractConfiguration & config,
const CoordinationSettingsPtr & coordination_settings)
: my_server_id(my_server_id_)
, log_store(nuraft::cs_new<InMemoryLogStore>())
, log_store(nuraft::cs_new<NuKeeperLogStore>(
config.getString(config_prefix + ".log_storage_path"),
coordination_settings->rotate_log_storage_interval, coordination_settings->force_sync))
, cluster_config(nuraft::cs_new<nuraft::cluster_config>())
{
Poco::Util::AbstractConfiguration::Keys keys;
config.keys(config_prefix, keys);
config.keys(config_prefix + ".raft_configuration", keys);
for (const auto & server_key : keys)
{
std::string full_prefix = config_prefix + "." + server_key;
std::string full_prefix = config_prefix + ".raft_configuration." + server_key;
int server_id = config.getInt(full_prefix + ".id");
std::string hostname = config.getString(full_prefix + ".hostname");
int port = config.getInt(full_prefix + ".port");
@ -53,13 +57,23 @@ InMemoryStateManager::InMemoryStateManager(
cluster_config->get_servers().push_back(peer_config);
}
if (!my_server_config)
throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section");
throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section", my_server_id);
if (start_as_follower_servers.size() == cluster_config->get_servers().size())
throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without <start_as_follower>)");
}
void InMemoryStateManager::save_config(const nuraft::cluster_config & config)
void NuKeeperStateManager::loadLogStore(size_t start_log_index)
{
log_store->init(start_log_index);
}
void NuKeeperStateManager::flushLogStore()
{
log_store->flush();
}
void NuKeeperStateManager::save_config(const nuraft::cluster_config & config)
{
// Just keep in memory in this example.
// Need to write to disk here, if we want to make it durable.
@ -67,7 +81,7 @@ void InMemoryStateManager::save_config(const nuraft::cluster_config & config)
cluster_config = nuraft::cluster_config::deserialize(*buf);
}
void InMemoryStateManager::save_state(const nuraft::srv_state & state)
void NuKeeperStateManager::save_state(const nuraft::srv_state & state)
{
// Just keep in memory in this example.
// Need to write to disk here, if we want to make it durable.

View File

@ -2,25 +2,32 @@
#include <Core/Types.h>
#include <string>
#include <Coordination/InMemoryLogStore.h>
#include <Coordination/NuKeeperLogStore.h>
#include <Coordination/CoordinationSettings.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <Poco/Util/AbstractConfiguration.h>
namespace DB
{
class InMemoryStateManager : public nuraft::state_mgr
class NuKeeperStateManager : public nuraft::state_mgr
{
public:
InMemoryStateManager(
NuKeeperStateManager(
int server_id_,
const std::string & config_prefix,
const Poco::Util::AbstractConfiguration & config);
const Poco::Util::AbstractConfiguration & config,
const CoordinationSettingsPtr & coordination_settings);
InMemoryStateManager(
NuKeeperStateManager(
int server_id_,
const std::string & host,
int port);
int port,
const std::string & logs_path);
void loadLogStore(size_t start_log_index);
void flushLogStore();
nuraft::ptr<nuraft::cluster_config> load_config() override { return cluster_config; }
@ -49,7 +56,7 @@ private:
int my_server_id;
int my_port;
std::unordered_set<int> start_as_follower_servers;
nuraft::ptr<InMemoryLogStore> log_store;
nuraft::ptr<NuKeeperLogStore> log_store;
nuraft::ptr<nuraft::srv_config> my_server_config;
nuraft::ptr<nuraft::cluster_config> cluster_config;
nuraft::ptr<nuraft::srv_state> server_state;

View File

@ -25,10 +25,10 @@ static String parentPath(const String & path)
return "/";
}
static String baseName(const String & path)
static std::string getBaseName(const String & path)
{
auto rslash_pos = path.rfind('/');
return path.substr(rslash_pos + 1);
size_t basename_start = path.rfind('/');
return std::string{&path[basename_start + 1], path.length() - basename_start - 1};
}
static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type)
@ -167,14 +167,17 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest
/// Increment sequential number even if node is not sequential
++it->second.seq_num;
response.path_created = path_created;
container.emplace(path_created, std::move(created_node));
auto child_path = getBaseName(path_created);
it->second.children.insert(child_path);
if (request.is_ephemeral)
ephemerals[session_id].emplace(path_created);
undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first]
undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path]
{
container.erase(path_created);
if (is_ephemeral)
@ -183,6 +186,7 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest
--undo_parent.stat.cversion;
--undo_parent.stat.numChildren;
--undo_parent.seq_num;
undo_parent.children.erase(child_path);
};
++it->second.stat.cversion;
@ -250,13 +254,16 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
if (prev_node.is_ephemeral)
ephemerals[session_id].erase(request.path);
container.erase(it);
auto child_basename = getBaseName(it->first);
auto & parent = container.at(parentPath(request.path));
--parent.stat.numChildren;
++parent.stat.cversion;
parent.children.erase(child_basename);
response.error = Coordination::Error::ZOK;
undo = [prev_node, &container, &ephemerals, session_id, path = request.path]
container.erase(it);
undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename]
{
if (prev_node.is_ephemeral)
ephemerals[session_id].emplace(path);
@ -265,6 +272,7 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest
auto & undo_parent = container.at(parentPath(path));
++undo_parent.stat.numChildren;
--undo_parent.stat.cversion;
undo_parent.children.insert(child_basename);
};
}
@ -370,17 +378,9 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest
if (path_prefix.empty())
throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR);
if (path_prefix.back() != '/')
path_prefix += '/';
response.names.insert(response.names.end(), it->second.children.begin(), it->second.children.end());
/// Fairly inefficient.
for (auto child_it = container.upper_bound(path_prefix);
child_it != container.end() && startsWith(child_it->first, path_prefix);
++child_it)
{
if (parentPath(child_it->first) == request.path)
response.names.emplace_back(baseName(child_it->first));
}
std::sort(response.names.begin(), response.names.end());
response.stat = it->second.stat;
response.error = Coordination::Error::ZOK;

View File

@ -16,6 +16,7 @@ using namespace DB;
struct NuKeeperStorageRequest;
using NuKeeperStorageRequestPtr = std::shared_ptr<NuKeeperStorageRequest>;
using ResponseCallback = std::function<void(const Coordination::ZooKeeperResponsePtr &)>;
using ChildrenSet = std::unordered_set<std::string>;
class NuKeeperStorage
{
@ -30,6 +31,7 @@ public:
bool is_sequental = false;
Coordination::Stat stat{};
int32_t seq_num = 0;
ChildrenSet children{};
};
struct ResponseForSession
@ -48,9 +50,9 @@ public:
using RequestsForSessions = std::vector<RequestForSession>;
using Container = std::map<std::string, Node>;
using Ephemerals = std::unordered_map<int64_t, std::unordered_set<String>>;
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<String>>;
using Container = std::unordered_map<std::string, Node>;
using Ephemerals = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionAndTimeout = std::unordered_map<int64_t, long>;
using SessionIDs = std::vector<int64_t>;

View File

@ -59,13 +59,16 @@ void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffe
size_t container_size;
Coordination::read(container_size, in);
while (storage.container.size() < container_size)
size_t current_size = 0;
while (current_size < container_size)
{
std::string path;
Coordination::read(path, in);
NuKeeperStorage::Node node;
readNode(node, in);
storage.container[path] = node;
current_size++;
}
size_t ephemerals_size;
Coordination::read(ephemerals_size, in);

View File

@ -6,9 +6,10 @@
#endif
#if USE_NURAFT
#include <Poco/ConsoleChannel.h>
#include <Poco/Logger.h>
#include <Coordination/InMemoryLogStore.h>
#include <Coordination/InMemoryStateManager.h>
#include <Coordination/NuKeeperStateManager.h>
#include <Coordination/NuKeeperStorageSerializer.h>
#include <Coordination/SummingStateMachine.h>
#include <Coordination/NuKeeperStateMachine.h>
@ -20,9 +21,35 @@
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/Exception.h>
#include <common/logger_useful.h>
#include <libnuraft/nuraft.hxx> // Y_IGNORE
#include <thread>
#include <Coordination/NuKeeperLogStore.h>
#include <Coordination/Changelog.h>
#include <filesystem>
namespace fs = std::filesystem;
struct ChangelogDirTest
{
std::string path;
bool drop;
explicit ChangelogDirTest(std::string path_, bool drop_ = true)
: path(path_)
, drop(drop_)
{
if (fs::exists(path))
{
EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run the test";
}
fs::create_directory(path);
}
~ChangelogDirTest()
{
if (fs::exists(path) && drop)
fs::remove_all(path);
}
};
TEST(CoordinationTest, BuildTest)
{
@ -67,14 +94,15 @@ TEST(CoordinationTest, BufferSerde)
template <typename StateMachine>
struct SimpliestRaftServer
{
SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_)
SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_, const std::string & logs_path)
: server_id(server_id_)
, hostname(hostname_)
, port(port_)
, endpoint(hostname + ":" + std::to_string(port))
, state_machine(nuraft::cs_new<StateMachine>())
, state_manager(nuraft::cs_new<DB::InMemoryStateManager>(server_id, hostname, port))
, state_manager(nuraft::cs_new<DB::NuKeeperStateManager>(server_id, hostname, port, logs_path))
{
state_manager->loadLogStore(1);
nuraft::raft_params params;
params.heart_beat_interval_ = 100;
params.election_timeout_lower_bound_ = 200;
@ -90,10 +118,10 @@ struct SimpliestRaftServer
if (!raft_instance)
{
std::cerr << "Failed to initialize launcher (see the message "
"in the log file)." << std::endl;
std::cerr << "Failed to initialize launcher" << std::endl;
exit(-1);
}
std::cout << "init Raft instance " << server_id;
for (size_t ii = 0; ii < 20; ++ii)
{
@ -123,7 +151,7 @@ struct SimpliestRaftServer
nuraft::ptr<StateMachine> state_machine;
// State manager.
nuraft::ptr<nuraft::state_mgr> state_manager;
nuraft::ptr<DB::NuKeeperStateManager> state_manager;
// Raft launcher.
nuraft::raft_launcher launcher;
@ -134,11 +162,10 @@ struct SimpliestRaftServer
using SummingRaftServer = SimpliestRaftServer<DB::SummingStateMachine>;
nuraft::ptr<nuraft::buffer> getLogEntry(int64_t number)
nuraft::ptr<nuraft::buffer> getBuffer(int64_t number)
{
nuraft::ptr<nuraft::buffer> ret = nuraft::buffer::alloc(sizeof(number));
nuraft::buffer_serializer bs(ret);
// WARNING: We don't consider endian-safety in this example.
bs.put_raw(&number, sizeof(number));
return ret;
}
@ -146,12 +173,13 @@ nuraft::ptr<nuraft::buffer> getLogEntry(int64_t number)
TEST(CoordinationTest, TestSummingRaft1)
{
SummingRaftServer s1(1, "localhost", 44444);
ChangelogDirTest test("./logs");
SummingRaftServer s1(1, "localhost", 44444, "./logs");
/// Single node is leader
EXPECT_EQ(s1.raft_instance->get_leader(), 1);
auto entry1 = getLogEntry(143);
auto entry1 = getBuffer(143);
auto ret = s1.raft_instance->append_entries({entry1});
EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code();
EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code();
@ -169,17 +197,23 @@ TEST(CoordinationTest, TestSummingRaft1)
TEST(CoordinationTest, TestSummingRaft3)
{
SummingRaftServer s1(1, "localhost", 44444);
SummingRaftServer s2(2, "localhost", 44445);
SummingRaftServer s3(3, "localhost", 44446);
ChangelogDirTest test1("./logs1");
SummingRaftServer s1(1, "localhost", 44444, "./logs1");
ChangelogDirTest test2("./logs2");
SummingRaftServer s2(2, "localhost", 44445, "./logs2");
ChangelogDirTest test3("./logs3");
SummingRaftServer s3(3, "localhost", 44446, "./logs3");
nuraft::srv_config first_config(1, "localhost:44444");
nuraft::srv_config first_config(1, 0, "localhost:44444", "", false, 0);
auto ret1 = s2.raft_instance->add_srv(first_config);
if (!ret1->get_accepted())
while (!ret1->get_accepted())
{
std::cout << "failed to add server: "
<< ret1->get_result_str() << std::endl;
EXPECT_TRUE(false);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
ret1 = s2.raft_instance->add_srv(first_config);
}
while (s1.raft_instance->get_leader() != 2)
@ -188,13 +222,15 @@ TEST(CoordinationTest, TestSummingRaft3)
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
nuraft::srv_config third_config(3, "localhost:44446");
nuraft::srv_config third_config(3, 0, "localhost:44446", "", false, 0);
auto ret3 = s2.raft_instance->add_srv(third_config);
if (!ret3->get_accepted())
{
std::cout << "failed to add server: "
<< ret3->get_result_str() << std::endl;
EXPECT_TRUE(false);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
ret3 = s2.raft_instance->add_srv(third_config);
}
while (s3.raft_instance->get_leader() != 2)
@ -209,10 +245,13 @@ TEST(CoordinationTest, TestSummingRaft3)
EXPECT_EQ(s3.raft_instance->get_leader(), 2);
std::cerr << "Starting to add entries\n";
auto entry = getLogEntry(1);
auto entry = getBuffer(1);
auto ret = s2.raft_instance->append_entries({entry});
EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code();
EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code();
while (!ret->get_accepted() || ret->get_result_code() != nuraft::cmd_result_code::OK)
{
std::cerr << ret->get_accepted() << "failed to replicate: entry 1" << ret->get_result_code() << std::endl;
ret = s2.raft_instance->append_entries({entry});
}
while (s1.state_machine->getValue() != 1)
{
@ -236,7 +275,7 @@ TEST(CoordinationTest, TestSummingRaft3)
EXPECT_EQ(s2.state_machine->getValue(), 1);
EXPECT_EQ(s3.state_machine->getValue(), 1);
auto non_leader_entry = getLogEntry(3);
auto non_leader_entry = getBuffer(3);
auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry});
EXPECT_FALSE(ret_non_leader1->get_accepted());
@ -245,10 +284,13 @@ TEST(CoordinationTest, TestSummingRaft3)
EXPECT_FALSE(ret_non_leader3->get_accepted());
auto leader_entry = getLogEntry(77);
auto leader_entry = getBuffer(77);
auto ret_leader = s2.raft_instance->append_entries({leader_entry});
EXPECT_TRUE(ret_leader->get_accepted()) << "failed to replicate: entry 78" << ret_leader->get_result_code();
EXPECT_EQ(ret_leader->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 78" << ret_leader->get_result_code();
while (!ret_leader->get_accepted() || ret_leader->get_result_code() != nuraft::cmd_result_code::OK)
{
std::cerr << "failed to replicate: entry 78" << ret_leader->get_result_code() << std::endl;
ret_leader = s2.raft_instance->append_entries({leader_entry});
}
while (s1.state_machine->getValue() != 78)
{
@ -333,4 +375,586 @@ TEST(CoordinationTest, TestStorageSerialization)
EXPECT_EQ(new_storage.ephemerals[1].size(), 1);
}
DB::LogEntryPtr getLogEntry(const std::string & s, size_t term)
{
DB::WriteBufferFromNuraftBuffer bufwriter;
writeText(s, bufwriter);
return nuraft::cs_new<nuraft::log_entry>(term, bufwriter.getBuffer());
}
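The summing-raft tests earlier in this file now call getBuffer(int) instead of a log-entry helper. A minimal sketch of such a helper, assuming the same WriteBufferFromNuraftBuffer API and DB::writeIntBinary from the usual write helpers (the fixture's actual signature may differ):
nuraft::ptr<nuraft::buffer> getBuffer(int64_t number)
{
    DB::WriteBufferFromNuraftBuffer bufwriter;
    DB::writeIntBinary(number, bufwriter); /// serialize the integer payload
    return bufwriter.getBuffer();          /// hand the accumulated bytes to NuRaft
}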
TEST(CoordinationTest, ChangelogTestSimple)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
auto entry = getLogEntry("hello world", 77);
changelog.append(entry);
EXPECT_EQ(changelog.next_slot(), 2);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.last_entry()->get_term(), 77);
EXPECT_EQ(changelog.entry_at(1)->get_term(), 77);
EXPECT_EQ(changelog.log_entries(1, 2)->size(), 1);
}
TEST(CoordinationTest, ChangelogTestFile)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
auto entry = getLogEntry("hello world", 77);
changelog.append(entry);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
for (const auto & p : fs::directory_iterator("./logs"))
EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin");
changelog.append(entry);
changelog.append(entry);
changelog.append(entry);
changelog.append(entry);
changelog.append(entry);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
}
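These assertions pin down the on-disk naming scheme: each file is changelog_<start>_<end>.bin and spans exactly rotate_interval entries (5 here). A hedged sketch of the naming rule, not the actual Changelog code:
#include <cstddef>
#include <string>

std::string changelogFileName(size_t start_index, size_t rotate_interval)
{
    size_t end_index = start_index + rotate_interval - 1; /// inclusive range, e.g. 1..5
    return "changelog_" + std::to_string(start_index) + "_" + std::to_string(end_index) + ".bin";
}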
TEST(CoordinationTest, ChangelogReadWrite)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 1000, true);
changelog.init(1);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 10);
DB::NuKeeperLogStore changelog_reader("./logs", 1000, true);
changelog_reader.init(1);
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term());
EXPECT_EQ(changelog_reader.start_index(), changelog.start_index());
EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot());
for (size_t i = 0; i < 10; ++i)
EXPECT_EQ(changelog_reader.entry_at(i + 1)->get_term(), changelog.entry_at(i + 1)->get_term());
auto entries_from_range_read = changelog_reader.log_entries(1, 11);
auto entries_from_range = changelog.log_entries(1, 11);
EXPECT_EQ(entries_from_range_read->size(), entries_from_range->size());
EXPECT_EQ(10, entries_from_range->size());
}
TEST(CoordinationTest, ChangelogWriteAt)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 1000, true);
changelog.init(1);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 10);
auto entry = getLogEntry("writer", 77);
changelog.write_at(7, entry);
EXPECT_EQ(changelog.size(), 7);
EXPECT_EQ(changelog.last_entry()->get_term(), 77);
EXPECT_EQ(changelog.entry_at(7)->get_term(), 77);
EXPECT_EQ(changelog.next_slot(), 8);
DB::NuKeeperLogStore changelog_reader("./logs", 1000, true);
changelog_reader.init(1);
EXPECT_EQ(changelog_reader.size(), changelog.size());
EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term());
EXPECT_EQ(changelog_reader.start_index(), changelog.start_index());
EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot());
}
TEST(CoordinationTest, ChangelogTestAppendAfterRead)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 7; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 7);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1);
EXPECT_EQ(changelog_reader.size(), 7);
for (size_t i = 7; i < 10; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
changelog_reader.append(entry);
}
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
size_t logs_count = 0;
for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs"))
logs_count++;
EXPECT_EQ(logs_count, 2);
auto entry = getLogEntry("someentry", 77);
changelog_reader.append(entry);
EXPECT_EQ(changelog_reader.size(), 11);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
logs_count = 0;
for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs"))
logs_count++;
EXPECT_EQ(logs_count, 3);
}
TEST(CoordinationTest, ChangelogTestCompaction)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 3; ++i)
{
auto entry = getLogEntry("hello world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 3);
changelog.compact(2);
EXPECT_EQ(changelog.size(), 1);
EXPECT_EQ(changelog.start_index(), 3);
EXPECT_EQ(changelog.next_slot(), 4);
EXPECT_EQ(changelog.last_entry()->get_term(), 20);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
auto e1 = getLogEntry("hello world", 30);
changelog.append(e1);
auto e2 = getLogEntry("hello world", 40);
changelog.append(e2);
auto e3 = getLogEntry("hello world", 50);
changelog.append(e3);
auto e4 = getLogEntry("hello world", 60);
changelog.append(e4);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
changelog.compact(6);
EXPECT_FALSE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_EQ(changelog.size(), 1);
EXPECT_EQ(changelog.start_index(), 7);
EXPECT_EQ(changelog.next_slot(), 8);
EXPECT_EQ(changelog.last_entry()->get_term(), 60);
/// And we are able to read it
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(7);
EXPECT_EQ(changelog_reader.size(), 1);
EXPECT_EQ(changelog_reader.start_index(), 7);
EXPECT_EQ(changelog_reader.next_slot(), 8);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 60);
}
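The checks above imply the compaction rule: compact(up_to) drops entries through up_to but removes a changelog file only when its whole range is compacted away, which is why compact(2) keeps changelog_1_5.bin while compact(6) deletes it. A hedged predicate capturing that behavior:
#include <cstddef>

bool canRemoveChangelogFile(size_t file_end_index, size_t compacted_up_to)
{
    /// A file may be deleted only if every entry it holds is at or below the compaction point.
    return file_end_index <= compacted_up_to;
}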
TEST(CoordinationTest, ChangelogTestBatchOperations)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 100, true);
changelog.init(1);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 10);
auto entries = changelog.pack(1, 5);
DB::NuKeeperLogStore apply_changelog("./logs", 100, true);
apply_changelog.init(1);
for (size_t i = 0; i < 10; ++i)
{
EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10);
}
EXPECT_EQ(apply_changelog.size(), 10);
apply_changelog.apply_pack(8, *entries);
EXPECT_EQ(apply_changelog.size(), 12);
EXPECT_EQ(apply_changelog.start_index(), 1);
EXPECT_EQ(apply_changelog.next_slot(), 13);
for (size_t i = 0; i < 7; ++i)
{
EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10);
}
EXPECT_EQ(apply_changelog.entry_at(8)->get_term(), 0);
EXPECT_EQ(apply_changelog.entry_at(9)->get_term(), 10);
EXPECT_EQ(apply_changelog.entry_at(10)->get_term(), 20);
EXPECT_EQ(apply_changelog.entry_at(11)->get_term(), 30);
EXPECT_EQ(apply_changelog.entry_at(12)->get_term(), 40);
}
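Taken together, these assertions describe apply_pack(index, pack) as "overwrite in place, then extend": entries 1..7 keep their old terms, while positions 8..12 are filled from a pack built over entries 1..5 (terms 0..40). A conceptual, self-contained model over a plain vector of terms (1-based indexing), not the real NuKeeperLogStore code:
#include <cstddef>
#include <cstdint>
#include <vector>

void applyPackModel(std::vector<uint64_t> & terms, size_t index, const std::vector<uint64_t> & pack)
{
    for (uint64_t term : pack)
    {
        if (index - 1 < terms.size())
            terms[index - 1] = term; /// overwrite an existing slot; unlike write_at, the tail is kept
        else
            terms.push_back(term);   /// extend the log past the previous next_slot
        ++index;
    }
}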
TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 100, true);
changelog.init(1);
for (size_t i = 0; i < 10; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 10);
auto entries = changelog.pack(5, 5);
ChangelogDirTest test1("./logs1");
DB::NuKeeperLogStore changelog_new("./logs1", 100, true);
changelog_new.init(1);
EXPECT_EQ(changelog_new.size(), 0);
changelog_new.apply_pack(5, *entries);
EXPECT_EQ(changelog_new.size(), 5);
EXPECT_EQ(changelog_new.start_index(), 5);
EXPECT_EQ(changelog_new.next_slot(), 10);
for (size_t i = 4; i < 9; ++i)
EXPECT_EQ(changelog_new.entry_at(i + 1)->get_term(), i * 10);
auto e = getLogEntry("hello_world", 110);
changelog_new.append(e);
EXPECT_EQ(changelog_new.size(), 6);
EXPECT_EQ(changelog_new.start_index(), 5);
EXPECT_EQ(changelog_new.next_slot(), 11);
DB::NuKeeperLogStore changelog_reader("./logs1", 100, true);
changelog_reader.init(5);
}
TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 33; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin"));
EXPECT_EQ(changelog.size(), 33);
auto e1 = getLogEntry("helloworld", 5555);
changelog.write_at(7, e1);
EXPECT_EQ(changelog.size(), 7);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.next_slot(), 8);
EXPECT_EQ(changelog.last_entry()->get_term(), 5555);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
DB::NuKeeperLogStore changelog_read("./logs", 5, true);
changelog_read.init(1);
EXPECT_EQ(changelog_read.size(), 7);
EXPECT_EQ(changelog_read.start_index(), 1);
EXPECT_EQ(changelog_read.next_slot(), 8);
EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555);
}
TEST(CoordinationTest, ChangelogTestWriteAtFileBorder)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 33; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin"));
EXPECT_EQ(changelog.size(), 33);
auto e1 = getLogEntry("helloworld", 5555);
changelog.write_at(11, e1);
EXPECT_EQ(changelog.size(), 11);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.next_slot(), 12);
EXPECT_EQ(changelog.last_entry()->get_term(), 5555);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
DB::NuKeeperLogStore changelog_read("./logs", 5, true);
changelog_read.init(1);
EXPECT_EQ(changelog_read.size(), 11);
EXPECT_EQ(changelog_read.start_index(), 1);
EXPECT_EQ(changelog_read.next_slot(), 12);
EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555);
}
TEST(CoordinationTest, ChangelogTestWriteAtAllFiles)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 33; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin"));
EXPECT_EQ(changelog.size(), 33);
auto e1 = getLogEntry("helloworld", 5555);
changelog.write_at(1, e1);
EXPECT_EQ(changelog.size(), 1);
EXPECT_EQ(changelog.start_index(), 1);
EXPECT_EQ(changelog.next_slot(), 2);
EXPECT_EQ(changelog.last_entry()->get_term(), 5555);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
}
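All three write_at tests follow one file-level rule: rewriting index N truncates the in-memory log to N and deletes every changelog file that starts after N, while the file containing N is rewritten in place. A hedged predicate for the deletion side:
#include <cstddef>

bool shouldRemoveFileAfterWriteAt(size_t file_start_index, size_t rewritten_index)
{
    /// changelog_11_15.bin survives write_at(11) but not write_at(7):
    /// only files starting strictly past the rewritten index are dropped.
    return file_start_index > rewritten_index;
}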
TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 35; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 35);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin"));
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1);
auto entry = getLogEntry("36_hello_world", 360);
changelog_reader.append(entry);
EXPECT_EQ(changelog_reader.size(), 36);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin"));
}
TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 5, true);
changelog.init(1);
for (size_t i = 0; i < 35; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10);
changelog.append(entry);
}
EXPECT_EQ(changelog.size(), 35);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin"));
DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
plain_buf.truncate(0);
DB::NuKeeperLogStore changelog_reader("./logs", 5, true);
changelog_reader.init(1);
EXPECT_EQ(changelog_reader.size(), 10);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
auto entry = getLogEntry("h", 7777);
changelog_reader.append(entry);
EXPECT_EQ(changelog_reader.size(), 11);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777);
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin"));
DB::NuKeeperLogStore changelog_reader2("./logs", 5, true);
changelog_reader2.init(1);
EXPECT_EQ(changelog_reader2.size(), 11);
EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777);
}
TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 20, true);
changelog.init(1);
for (size_t i = 0; i < 35; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10);
changelog.append(entry);
}
EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin"));
DB::WriteBufferFromFile plain_buf("./logs/changelog_1_20.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
plain_buf.truncate(140);
DB::NuKeeperLogStore changelog_reader("./logs", 20, true);
changelog_reader.init(1);
EXPECT_EQ(changelog_reader.size(), 2);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450);
EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin"));
EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin"));
auto entry = getLogEntry("hello_world", 7777);
changelog_reader.append(entry);
EXPECT_EQ(changelog_reader.size(), 3);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777);
DB::NuKeeperLogStore changelog_reader2("./logs", 20, true);
changelog_reader2.init(1);
EXPECT_EQ(changelog_reader2.size(), 3);
EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777);
}
TEST(CoordinationTest, ChangelogTestLostFiles)
{
ChangelogDirTest test("./logs");
DB::NuKeeperLogStore changelog("./logs", 20, true);
changelog.init(1);
for (size_t i = 0; i < 35; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10);
changelog.append(entry);
}
EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin"));
EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin"));
fs::remove("./logs/changelog_1_20.bin");
DB::NuKeeperLogStore changelog_reader("./logs", 20, true);
EXPECT_THROW(changelog_reader.init(5), DB::Exception);
fs::remove("./logs/changelog_21_40.bin");
EXPECT_THROW(changelog_reader.init(3), DB::Exception);
}
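A truncated file still leaves a readable prefix (see the two tests above), but a missing file is fatal: init must refuse to start when the remaining files no longer cover the requested index. A hedged sketch of that startup check, with the error code chosen for illustration only:
void assertChangelogCoversIndex(size_t from_index, size_t first_index_on_disk)
{
    if (first_index_on_disk > from_index)
        throw DB::Exception(
            "Changelog files covering index " + std::to_string(from_index) + " are missing",
            DB::ErrorCodes::CORRUPTED_DATA);
}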
int main(int argc, char ** argv)
{
Poco::AutoPtr<Poco::ConsoleChannel> channel(new Poco::ConsoleChannel(std::cerr));
Poco::Logger::root().setChannel(channel);
Poco::Logger::root().setLevel("trace");
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

View File

@ -2811,12 +2811,7 @@ namespace
const auto & array_data_type = assert_cast<const DataTypeArray &>(*data_type);
if (!allow_repeat)
{
throw Exception(
"The field " + quoteString(field_descriptor.full_name())
+ " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}),
ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED);
}
throwFieldNotRepeated(field_descriptor, column_name);
auto nested_serializer = buildFieldSerializer(column_name, array_data_type.getNestedType(), field_descriptor,
/* allow_repeat = */ false); // We do our repeating now, so for nested type we forget about the repeating.
@ -2861,12 +2856,7 @@ namespace
/// Serialize as a repeated field.
if (!allow_repeat && (size_of_tuple > 1))
{
throw Exception(
"The field " + quoteString(field_descriptor.full_name())
+ " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}),
ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED);
}
throwFieldNotRepeated(field_descriptor, column_name);
std::vector<std::unique_ptr<ProtobufSerializer>> nested_serializers;
for (const auto & nested_data_type : tuple_data_type.getElements())
@ -2892,6 +2882,21 @@ namespace
}
}
[[noreturn]] static void throwFieldNotRepeated(const FieldDescriptor & field_descriptor, const std::string_view & column_name)
{
if (!field_descriptor.is_repeated())
throw Exception(
"The field " + quoteString(field_descriptor.full_name())
+ " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}),
ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED);
throw Exception(
"The field " + quoteString(field_descriptor.full_name())
+ " is repeated but the level of repeatedness is not enough to serialize a multidimensional array from the column "
+ backQuote(StringRef{column_name}) + ". It's recommended to make the parent field repeated as well.",
ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED);
}
const ProtobufReaderOrWriter reader_or_writer;
};
}

View File

@ -25,16 +25,18 @@ ColumnPtr ExecutableFunctionJoinGet<or_null>::execute(const ColumnsWithTypeAndNa
auto key = arguments[i];
keys.emplace_back(std::move(key));
}
return join->joinGet(keys, result_columns).column;
return storage_join->joinGet(keys, result_columns).column;
}
template <bool or_null>
ExecutableFunctionImplPtr FunctionJoinGet<or_null>::prepare(const ColumnsWithTypeAndName &) const
{
return std::make_unique<ExecutableFunctionJoinGet<or_null>>(join, DB::Block{{return_type->createColumn(), return_type, attr_name}});
Block result_columns {{return_type->createColumn(), return_type, attr_name}};
return std::make_unique<ExecutableFunctionJoinGet<or_null>>(table_lock, storage_join, result_columns);
}
static auto getJoin(const ColumnsWithTypeAndName & arguments, const Context & context)
static std::pair<std::shared_ptr<StorageJoin>, String>
getJoin(const ColumnsWithTypeAndName & arguments, const Context & context)
{
String join_name;
if (const auto * name_col = checkAndGetColumnConst<ColumnString>(arguments[0].column.get()))
@ -87,13 +89,12 @@ FunctionBaseImplPtr JoinGetOverloadResolver<or_null>::build(const ColumnsWithTyp
+ ", should be greater or equal to 3",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
auto [storage_join, attr_name] = getJoin(arguments, context);
auto join = storage_join->getJoin();
DataTypes data_types(arguments.size() - 2);
for (size_t i = 2; i < arguments.size(); ++i)
data_types[i - 2] = arguments[i].type;
auto return_type = join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null);
auto return_type = storage_join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null);
auto table_lock = storage_join->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout);
return std::make_unique<FunctionJoinGet<or_null>>(table_lock, storage_join, join, attr_name, data_types, return_type);
return std::make_unique<FunctionJoinGet<or_null>>(table_lock, storage_join, attr_name, data_types, return_type);
}
void registerFunctionJoinGet(FunctionFactory & factory)

View File

@ -9,14 +9,20 @@ namespace DB
class Context;
class HashJoin;
using HashJoinPtr = std::shared_ptr<HashJoin>;
class StorageJoin;
using StorageJoinPtr = std::shared_ptr<StorageJoin>;
template <bool or_null>
class ExecutableFunctionJoinGet final : public IExecutableFunctionImpl
{
public:
ExecutableFunctionJoinGet(HashJoinPtr join_, const DB::Block & result_columns_)
: join(std::move(join_)), result_columns(result_columns_) {}
ExecutableFunctionJoinGet(TableLockHolder table_lock_,
StorageJoinPtr storage_join_,
const DB::Block & result_columns_)
: table_lock(std::move(table_lock_))
, storage_join(std::move(storage_join_))
, result_columns(result_columns_)
{}
static constexpr auto name = or_null ? "joinGetOrNull" : "joinGet";
@ -29,7 +35,8 @@ public:
String getName() const override { return name; }
private:
HashJoinPtr join;
TableLockHolder table_lock;
StorageJoinPtr storage_join;
DB::Block result_columns;
};
@ -39,12 +46,11 @@ class FunctionJoinGet final : public IFunctionBaseImpl
public:
static constexpr auto name = or_null ? "joinGetOrNull" : "joinGet";
FunctionJoinGet(TableLockHolder table_lock_, StoragePtr storage_join_,
HashJoinPtr join_, String attr_name_,
FunctionJoinGet(TableLockHolder table_lock_,
StorageJoinPtr storage_join_, String attr_name_,
DataTypes argument_types_, DataTypePtr return_type_)
: table_lock(std::move(table_lock_))
, storage_join(std::move(storage_join_))
, join(std::move(join_))
, storage_join(storage_join_)
, attr_name(std::move(attr_name_))
, argument_types(std::move(argument_types_))
, return_type(std::move(return_type_))
@ -60,8 +66,7 @@ public:
private:
TableLockHolder table_lock;
StoragePtr storage_join;
HashJoinPtr join;
StorageJoinPtr storage_join;
const String attr_name;
DataTypes argument_types;
DataTypePtr return_type;

View File

@ -739,7 +739,7 @@ static JoinPtr tryGetStorageJoin(std::shared_ptr<TableJoin> analyzed_join)
{
if (auto * table = analyzed_join->joined_storage.get())
if (auto * storage_join = dynamic_cast<StorageJoin *>(table))
return storage_join->getJoin(analyzed_join);
return storage_join->getJoinLocked(analyzed_join);
return {};
}

View File

@ -421,25 +421,12 @@ bool HashJoin::empty() const
return data->type == Type::EMPTY;
}
size_t HashJoin::getTotalByteCount() const
{
std::shared_lock lock(data->rwlock);
return getTotalByteCountLocked();
}
size_t HashJoin::getTotalRowCount() const
{
std::shared_lock lock(data->rwlock);
return getTotalRowCountLocked();
}
bool HashJoin::alwaysReturnsEmptySet() const
{
std::shared_lock lock(data->rwlock);
return isInnerOrRight(getKind()) && data->empty && !overDictionary();
}
size_t HashJoin::getTotalRowCountLocked() const
size_t HashJoin::getTotalRowCount() const
{
size_t res = 0;
@ -456,7 +443,7 @@ size_t HashJoin::getTotalRowCountLocked() const
return res;
}
size_t HashJoin::getTotalByteCountLocked() const
size_t HashJoin::getTotalByteCount() const
{
size_t res = 0;
@ -652,7 +639,9 @@ bool HashJoin::addJoinedBlock(const Block & source_block, bool check_limits)
size_t total_bytes = 0;
{
std::unique_lock lock(data->rwlock);
if (storage_join_lock.mutex())
throw DB::Exception("addJoinedBlock called when HashJoin locked to prevent updates",
ErrorCodes::LOGICAL_ERROR);
data->blocks.emplace_back(std::move(structured_block));
Block * stored_block = &data->blocks.back();
@ -677,8 +666,8 @@ bool HashJoin::addJoinedBlock(const Block & source_block, bool check_limits)
return true;
/// TODO: Do not calculate them every time
total_rows = getTotalRowCountLocked();
total_bytes = getTotalByteCountLocked();
total_rows = getTotalRowCount();
total_bytes = getTotalByteCount();
}
return table_join->sizeLimits().check(total_rows, total_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
@ -1216,11 +1205,8 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed)
block = block.cloneWithColumns(std::move(dst_columns));
}
DataTypePtr HashJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types, const String & column_name, bool or_null) const
{
std::shared_lock lock(data->rwlock);
size_t num_keys = data_types.size();
if (right_table_keys.columns() != num_keys)
throw Exception(
@ -1232,8 +1218,8 @@ DataTypePtr HashJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types,
{
const auto & left_type_origin = data_types[i];
const auto & [c2, right_type_origin, right_name] = right_table_keys.safeGetByPosition(i);
auto left_type = removeNullable(left_type_origin);
auto right_type = removeNullable(right_type_origin);
auto left_type = removeNullable(recursiveRemoveLowCardinality(left_type_origin));
auto right_type = removeNullable(recursiveRemoveLowCardinality(right_type_origin));
if (!left_type->equals(*right_type))
throw Exception(
"Type mismatch in joinGet key " + toString(i) + ": found type " + left_type->getName() + ", while the needed type is "
@ -1250,11 +1236,16 @@ DataTypePtr HashJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types,
return elem.type;
}
template <typename Maps>
ColumnWithTypeAndName HashJoin::joinGetImpl(const Block & block, const Block & block_with_columns_to_add, const Maps & maps_) const
/// TODO: return multiple columns as named tuple
/// TODO: return array of values when strictness == ASTTableJoin::Strictness::All
ColumnWithTypeAndName HashJoin::joinGet(const Block & block, const Block & block_with_columns_to_add) const
{
// Assemble the key block with correct names.
bool is_valid = (strictness == ASTTableJoin::Strictness::Any || strictness == ASTTableJoin::Strictness::RightAny)
&& kind == ASTTableJoin::Kind::Left;
if (!is_valid)
throw Exception("joinGet only supports StorageJoin of type Left Any", ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN);
/// Assemble the key block with correct names.
Block keys;
for (size_t i = 0; i < block.columns(); ++i)
{
@ -1263,32 +1254,15 @@ ColumnWithTypeAndName HashJoin::joinGetImpl(const Block & block, const Block & b
keys.insert(std::move(key));
}
static_assert(!MapGetter<ASTTableJoin::Kind::Left, ASTTableJoin::Strictness::Any>::flagged,
"joinGet is not protected from hash table changes between block processing");
joinBlockImpl<ASTTableJoin::Kind::Left, ASTTableJoin::Strictness::Any>(
keys, key_names_right, block_with_columns_to_add, maps_);
keys, key_names_right, block_with_columns_to_add, std::get<MapsOne>(data->maps));
return keys.getByPosition(keys.columns() - 1);
}
// TODO: return multiple columns as named tuple
// TODO: return array of values when strictness == ASTTableJoin::Strictness::All
ColumnWithTypeAndName HashJoin::joinGet(const Block & block, const Block & block_with_columns_to_add) const
{
std::shared_lock lock(data->rwlock);
if ((strictness == ASTTableJoin::Strictness::Any || strictness == ASTTableJoin::Strictness::RightAny) &&
kind == ASTTableJoin::Kind::Left)
{
return joinGetImpl(block, block_with_columns_to_add, std::get<MapsOne>(data->maps));
}
else
throw Exception("joinGet only supports StorageJoin of type Left Any", ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN);
}
void HashJoin::joinBlock(Block & block, ExtraBlockPtr & not_processed)
{
std::shared_lock lock(data->rwlock);
const Names & key_names_left = table_join->keyNamesLeft();
JoinCommon::checkTypesOfKeys(block, key_names_left, right_table_keys, key_names_right);

View File

@ -306,10 +306,6 @@ public:
struct RightTableData
{
/// Protect state for concurrent use in insertFromBlock and joinBlock.
/// @note these methods could be called simultaneously only while StorageJoin is in use.
mutable std::shared_mutex rwlock;
Type type = Type::EMPTY;
bool empty = true;
@ -322,6 +318,13 @@ public:
Arena pool;
};
/// We keep a correspondence between used_flags and the hash table's internal buffer.
/// The hash table cannot be modified during the HashJoin lifetime and must be protected with a lock.
void setLock(std::shared_mutex & rwlock)
{
storage_join_lock = std::shared_lock<std::shared_mutex>(rwlock);
}
void reuseJoinedData(const HashJoin & join);
std::shared_ptr<RightTableData> getJoinedData() const
@ -353,6 +356,8 @@ private:
/// Flags that indicate that particular row already used in join.
/// Flag is stored for every record in hash map.
/// Number of this flags equals to hashtable buffer size (plus one for zero value).
/// Changes in the hash table break that correspondence,
/// so we must guarantee the hash table stays constant during the HashJoin lifetime (via setLock)
mutable JoinStuff::JoinUsedFlags used_flags;
Sizes key_sizes;
@ -371,6 +376,10 @@ private:
Block totals;
/// Should be set via setLock to protect the hash table from modification by StorageJoin.
/// If set, the HashJoin instance is not available for modification (addJoinedBlock)
std::shared_lock<std::shared_mutex> storage_join_lock;
void init(Type type_);
const Block & savedBlockSample() const { return data->sample_block; }
@ -388,15 +397,8 @@ private:
void joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) const;
template <typename Maps>
ColumnWithTypeAndName joinGetImpl(const Block & block, const Block & block_with_columns_to_add, const Maps & maps_) const;
static Type chooseMethod(const ColumnRawPtrs & key_columns, Sizes & key_sizes);
/// Call with already locked rwlock.
size_t getTotalRowCountLocked() const;
size_t getTotalByteCountLocked() const;
bool empty() const;
bool overDictionary() const;
};

View File

@ -104,6 +104,7 @@ BlockIO InterpreterAlterQuery::execute()
if (!mutation_commands.empty())
{
table->checkMutationIsPossible(mutation_commands, context.getSettingsRef());
MutationsInterpreter(table, metadata_snapshot, mutation_commands, context, false).validate();
table->mutate(mutation_commands, context);
}

View File

@ -715,18 +715,17 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
if (storage)
{
String hint_name{};
std::vector<String> hint_name{};
for (const auto & name : columns_context.requiredColumns())
{
auto hints = storage->getHints(name);
if (!hints.empty())
hint_name = hint_name + " '" + toString(hints) + "'";
hint_name.insert(hint_name.end(), hints.begin(), hints.end());
}
if (!hint_name.empty())
{
ss << ", maybe you meant: ";
ss << hint_name;
ss << toString(hint_name);
}
}
else

View File

@ -40,7 +40,7 @@ namespace ErrorCodes
struct PollResult
{
size_t ready_responses_count{0};
size_t responses_count{0};
bool has_requests{false};
bool error{false};
};
@ -70,14 +70,14 @@ struct SocketInterruptablePollWrapper
if (epollfd < 0)
throwFromErrno("Cannot epoll_create", ErrorCodes::SYSTEM_ERROR);
socket_event.events = EPOLLIN | EPOLLERR;
socket_event.events = EPOLLIN | EPOLLERR | EPOLLPRI;
socket_event.data.fd = sockfd;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, &socket_event) < 0)
{
::close(epollfd);
throwFromErrno("Cannot insert socket into epoll queue", ErrorCodes::SYSTEM_ERROR);
}
pipe_event.events = EPOLLIN | EPOLLERR;
pipe_event.events = EPOLLIN | EPOLLERR | EPOLLPRI;
pipe_event.data.fd = pipe.fds_rw[0];
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, pipe.fds_rw[0], &pipe_event) < 0)
{
@ -92,97 +92,92 @@ struct SocketInterruptablePollWrapper
return pipe.fds_rw[1];
}
PollResult poll(Poco::Timespan remaining_time)
PollResult poll(Poco::Timespan remaining_time, const std::shared_ptr<ReadBufferFromPocoSocket> & in)
{
std::array<int, 2> outputs = {-1, -1};
bool socket_ready = false;
bool fd_ready = false;
if (in->available() != 0)
socket_ready = true;
if (response_in.available() != 0)
fd_ready = true;
int rc = 0;
if (!fd_ready)
{
#if defined(POCO_HAVE_FD_EPOLL)
int rc;
epoll_event evout[2];
memset(evout, 0, sizeof(evout));
do
{
Poco::Timestamp start;
rc = epoll_wait(epollfd, evout, 2, remaining_time.totalMilliseconds());
if (rc < 0 && errno == EINTR)
epoll_event evout[2];
evout[0].data.fd = evout[1].data.fd = -1;
do
{
Poco::Timestamp end;
Poco::Timespan waited = end - start;
if (waited < remaining_time)
remaining_time -= waited;
else
remaining_time = 0;
}
}
while (rc < 0 && errno == EINTR);
if (rc >= 1 && evout[0].events & EPOLLIN)
outputs[0] = evout[0].data.fd;
if (rc == 2 && evout[1].events & EPOLLIN)
outputs[1] = evout[1].data.fd;
#else
pollfd poll_buf[2];
poll_buf[0].fd = sockfd;
poll_buf[0].events = POLLIN;
poll_buf[1].fd = pipe.fds_rw[0];
poll_buf[1].events = POLLIN;
int rc;
do
{
Poco::Timestamp start;
rc = ::poll(poll_buf, 2, remaining_time.totalMilliseconds());
if (rc < 0 && errno == POCO_EINTR)
{
Poco::Timestamp end;
Poco::Timespan waited = end - start;
if (waited < remaining_time)
remaining_time -= waited;
else
remaining_time = 0;
}
}
while (rc < 0 && errno == POCO_EINTR);
if (rc >= 1 && poll_buf[0].revents & POLLIN)
outputs[0] = sockfd;
if (rc == 2 && poll_buf[1].revents & POLLIN)
outputs[1] = pipe.fds_rw[0];
#endif
PollResult result{};
if (rc < 0)
{
result.error = true;
return result;
}
else if (rc == 0)
{
return result;
}
else
{
for (auto fd : outputs)
{
if (fd != -1)
Poco::Timestamp start;
rc = epoll_wait(epollfd, evout, 2, remaining_time.totalMilliseconds());
if (rc < 0 && errno == EINTR)
{
if (fd == sockfd)
result.has_requests = true;
Poco::Timestamp end;
Poco::Timespan waited = end - start;
if (waited < remaining_time)
remaining_time -= waited;
else
{
UInt8 dummy;
do
{
/// All ready responses are stored in the responses queue,
/// but we have to count the number of ready responses in the pipe
/// and process only them. Otherwise the states of response_in
/// and the responses queue will become inconsistent and a race condition is possible.
readIntBinary(dummy, response_in);
result.ready_responses_count++;
}
while (response_in.available());
}
remaining_time = 0;
}
}
while (rc < 0 && errno == EINTR);
for (int i = 0; i < rc; ++i)
{
if (evout[i].data.fd == sockfd)
socket_ready = true;
if (evout[i].data.fd == pipe.fds_rw[0])
fd_ready = true;
}
#else
pollfd poll_buf[2];
poll_buf[0].fd = sockfd;
poll_buf[0].events = POLLIN;
poll_buf[1].fd = pipe.fds_rw[0];
poll_buf[1].events = POLLIN;
do
{
Poco::Timestamp start;
rc = ::poll(poll_buf, 2, remaining_time.totalMilliseconds());
if (rc < 0 && errno == POCO_EINTR)
{
Poco::Timestamp end;
Poco::Timespan waited = end - start;
if (waited < remaining_time)
remaining_time -= waited;
else
remaining_time = 0;
}
}
while (rc < 0 && errno == POCO_EINTR);
if (rc >= 1 && poll_buf[0].revents & POLLIN)
socket_ready = true;
if (rc == 2 && poll_buf[1].revents & POLLIN)
fd_ready = true;
#endif
}
PollResult result{};
result.has_requests = socket_ready;
if (fd_ready)
{
UInt8 dummy;
readIntBinary(dummy, response_in);
result.responses_count = 1;
auto available = response_in.available();
response_in.ignore(available);
result.responses_count += available;
}
if (rc < 0)
result.error = true;
return result;
}
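One convention is worth spelling out: the dispatcher writes exactly one byte into the pipe per ready response, so after consuming the byte that woke epoll, whatever is still buffered in response_in equals the number of additional responses that may be popped. A hedged restatement of that arithmetic, assuming only the generic ReadBuffer interface:
size_t countSignalledResponses(DB::ReadBuffer & response_in)
{
    UInt8 dummy;
    readIntBinary(dummy, response_in);     /// consume the byte that triggered the wakeup
    size_t rest = response_in.available(); /// each remaining buffered byte is one more ready response
    response_in.ignore(rest);
    return 1 + rest;
}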
@ -339,43 +334,40 @@ void NuKeeperTCPHandler::runImpl()
{
using namespace std::chrono_literals;
PollResult result = poll_wrapper->poll(session_timeout);
PollResult result = poll_wrapper->poll(session_timeout, in);
if (result.has_requests && !close_received)
{
do
{
auto [received_op, received_xid] = receiveRequest();
auto [received_op, received_xid] = receiveRequest();
if (received_op == Coordination::OpNum::Close)
{
LOG_DEBUG(log, "Received close event with xid {} for session id #{}", received_xid, session_id);
close_xid = received_xid;
close_received = true;
break;
}
else if (received_op == Coordination::OpNum::Heartbeat)
{
LOG_TRACE(log, "Received heartbeat for session #{}", session_id);
session_stopwatch.restart();
}
if (received_op == Coordination::OpNum::Close)
{
LOG_DEBUG(log, "Received close event with xid {} for session id #{}", received_xid, session_id);
close_xid = received_xid;
close_received = true;
}
else if (received_op == Coordination::OpNum::Heartbeat)
{
LOG_TRACE(log, "Received heartbeat for session #{}", session_id);
session_stopwatch.restart();
}
while (in->available());
}
/// Process the exact number of responses from the pipe,
/// otherwise the state of the responses queue and the signaling pipe
/// becomes inconsistent and a race condition is possible.
while (result.ready_responses_count != 0)
while (result.responses_count != 0)
{
Coordination::ZooKeeperResponsePtr response;
if (!responses->tryPop(response))
throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have at least {} ready responses, but queue is empty. It's a bug.", result.ready_responses_count);
throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have ready response, but queue is empty. It's a bug.");
if (response->xid == close_xid)
{
LOG_DEBUG(log, "Session #{} successfully closed", session_id);
return;
}
response->write(*out);
if (response->error == Coordination::Error::ZSESSIONEXPIRED)
{
@ -383,7 +375,8 @@ void NuKeeperTCPHandler::runImpl()
nu_keeper_storage_dispatcher->finishSession(session_id);
return;
}
result.ready_responses_count--;
result.responses_count--;
}
if (result.error)

View File

@ -145,6 +145,11 @@ void IStorage::checkAlterIsPossible(const AlterCommands & commands, const Settin
}
}
void IStorage::checkMutationIsPossible(const MutationCommands & /*commands*/, const Settings & /*settings*/) const
{
throw Exception("Table engine " + getName() + " doesn't support mutations", ErrorCodes::NOT_IMPLEMENTED);
}
void IStorage::checkAlterPartitionIsPossible(
const PartitionCommands & /*commands*/, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & /*settings*/) const
{

View File

@ -364,6 +364,11 @@ public:
*/
virtual void checkAlterIsPossible(const AlterCommands & commands, const Settings & settings) const;
/**
* Checks that mutation commands can be applied to storage.
*/
virtual void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const;
/** ALTER tables with regard to its partitions.
* Should handle locks for each command on its own.
*/

View File

@ -1670,6 +1670,12 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S
}
}
void MergeTreeData::checkMutationIsPossible(const MutationCommands & /*commands*/, const Settings & /*settings*/) const
{
/// Some validation will be added
}
MergeTreeDataPartType MergeTreeData::choosePartType(size_t bytes_uncompressed, size_t rows_count) const
{
const auto settings = getSettings();

View File

@ -519,6 +519,10 @@ public:
/// If something is wrong, throws an exception.
void checkAlterIsPossible(const AlterCommands & commands, const Settings & settings) const override;
/// Checks if the Mutation can be performed.
/// (currently no additional checks: always ok)
void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override;
/// Checks that partition name in all commands is valid
void checkAlterPartitionIsPossible(const PartitionCommands & commands, const StorageMetadataPtr & metadata_snapshot, const Settings & settings) const override;

View File

@ -99,9 +99,7 @@ StorageRabbitMQ::StorageRabbitMQ(
, unique_strbase(getRandomName())
, queue_size(std::max(QUEUE_SIZE, static_cast<uint32_t>(getMaxBlockSize())))
{
loop = std::make_unique<uv_loop_t>();
uv_loop_init(loop.get());
event_handler = std::make_shared<RabbitMQHandler>(loop.get(), log);
event_handler = std::make_shared<RabbitMQHandler>(loop.getLoop(), log);
restoreConnection(false);
StorageInMemoryMetadata storage_metadata;
@ -482,7 +480,7 @@ bool StorageRabbitMQ::restoreConnection(bool reconnecting)
/* Connection is not closed immediately (firstly, all pending operations are completed, and then
* an AMQP closing handshake is performed). But a new connection cannot be opened until the previous one is properly closed
*/
while (!connection->closed() && ++cnt_retries != RETRIES_MAX)
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
event_handler->iterateLoop();
/// This will force immediate closure if not yet closed
@ -498,7 +496,7 @@ bool StorageRabbitMQ::restoreConnection(bool reconnecting)
AMQP::Login(login_password.first, login_password.second), vhost));
cnt_retries = 0;
while (!connection->ready() && !stream_cancelled && ++cnt_retries != RETRIES_MAX)
while (!connection->ready() && !stream_cancelled && cnt_retries++ != RETRIES_MAX)
{
event_handler->iterateLoop();
std::this_thread::sleep_for(std::chrono::milliseconds(CONNECT_SLEEP));
@ -653,7 +651,7 @@ void StorageRabbitMQ::shutdown()
connection->close();
size_t cnt_retries = 0;
while (!connection->closed() && ++cnt_retries != RETRIES_MAX)
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
event_handler->iterateLoop();
/// Should actually force closure, if not yet closed, but it generates distracting error logs

View File

@ -9,6 +9,7 @@
#include <Storages/RabbitMQ/Buffer_fwd.h>
#include <Storages/RabbitMQ/RabbitMQHandler.h>
#include <Storages/RabbitMQ/RabbitMQSettings.h>
#include <Storages/RabbitMQ/UVLoop.h>
#include <Common/thread_local_rng.h>
#include <amqpcpp/libuv.h>
#include <uv.h>
@ -96,7 +97,7 @@ private:
std::pair<String, String> login_password;
String vhost;
std::unique_ptr<uv_loop_t> loop;
UVLoop loop;
std::shared_ptr<RabbitMQHandler> event_handler;
std::unique_ptr<AMQP::TcpConnection> connection; /// Connection for all consumers

View File

@ -0,0 +1,44 @@
#pragma once
#include <memory>
#include <boost/noncopyable.hpp>
#include <uv.h>
#include <Common/Exception.h>
namespace DB
{
namespace ErrorCodes
{
extern const int SYSTEM_ERROR;
}
/// RAII wrapper around uv event loop
class UVLoop : public boost::noncopyable
{
public:
UVLoop(): loop_ptr(new uv_loop_t())
{
int res = uv_loop_init(loop_ptr.get());
if (res != 0)
throw Exception("UVLoop could not initialize", ErrorCodes::SYSTEM_ERROR);
}
~UVLoop()
{
if (loop_ptr)
uv_loop_close(loop_ptr.get());
}
inline uv_loop_t * getLoop() { return loop_ptr.get(); }
inline const uv_loop_t * getLoop() const { return loop_ptr.get(); }
private:
std::unique_ptr<uv_loop_t> loop_ptr;
};
}
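With the RAII wrapper, callers no longer need the two-step uv_loop_init/uv_loop_close dance; a minimal usage sketch (libuv only, handler wiring elided):
#include <Storages/RabbitMQ/UVLoop.h>

int main()
{
    DB::UVLoop loop;                       /// uv_loop_init runs in the constructor
    uv_run(loop.getLoop(), UV_RUN_NOWAIT); /// the raw uv_loop_t * stays valid for the object's lifetime
    return 0;                              /// uv_loop_close runs in the destructor
}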

View File

@ -57,9 +57,7 @@ WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer(
, max_rows(rows_per_message)
, chunk_size(chunk_size_)
{
loop = std::make_unique<uv_loop_t>();
uv_loop_init(loop.get());
event_handler = std::make_unique<RabbitMQHandler>(loop.get(), log);
event_handler = std::make_unique<RabbitMQHandler>(loop.getLoop(), log);
if (setupConnection(false))
{
@ -97,7 +95,7 @@ WriteBufferToRabbitMQProducer::~WriteBufferToRabbitMQProducer()
connection->close();
size_t cnt_retries = 0;
while (!connection->closed() && ++cnt_retries != RETRIES_MAX)
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
{
event_handler->iterateLoop();
std::this_thread::sleep_for(std::chrono::milliseconds(CONNECT_SLEEP));
@ -189,11 +187,12 @@ void WriteBufferToRabbitMQProducer::setupChannel()
/// Delivery tags are scoped per channel.
delivery_record.clear();
delivery_tag = 0;
producer_ready = false;
});
producer_channel->onReady([&]()
{
channel_id = channel_id_base + std::to_string(channel_id_counter++);
channel_id = channel_id_base + "_" + std::to_string(channel_id_counter++);
LOG_DEBUG(log, "Producer's channel {} is ready", channel_id);
/* if persistent == true, onAck is received when message is persisted to disk or when it is consumed on every queue. If fails,
@ -211,6 +210,7 @@ void WriteBufferToRabbitMQProducer::setupChannel()
{
removeRecord(nacked_delivery_tag, multiple, true);
});
producer_ready = true;
});
}
@ -218,30 +218,27 @@ void WriteBufferToRabbitMQProducer::setupChannel()
void WriteBufferToRabbitMQProducer::removeRecord(UInt64 received_delivery_tag, bool multiple, bool republish)
{
auto record_iter = delivery_record.find(received_delivery_tag);
assert(record_iter != delivery_record.end());
if (record_iter != delivery_record.end())
if (multiple)
{
if (multiple)
{
/// If multiple is true, then all delivery tags up to and including current are confirmed (with ack or nack).
++record_iter;
/// If multiple is true, then all delivery tags up to and including current are confirmed (with ack or nack).
++record_iter;
if (republish)
for (auto record = delivery_record.begin(); record != record_iter; ++record)
returned.tryPush(record->second);
if (republish)
for (auto record = delivery_record.begin(); record != record_iter; ++record)
returned.tryPush(record->second);
/// Delete the records even in case when republished because new delivery tags will be assigned by the server.
delivery_record.erase(delivery_record.begin(), record_iter);
}
else
{
if (republish)
returned.tryPush(record_iter->second);
delivery_record.erase(record_iter);
}
/// Delete the records even in case when republished because new delivery tags will be assigned by the server.
delivery_record.erase(delivery_record.begin(), record_iter);
}
else
{
if (republish)
returned.tryPush(record_iter->second);
delivery_record.erase(record_iter);
}
/// else is theoretically not possible
}
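The multiple branch mirrors AMQP confirm semantics: acknowledging tag T with multiple=true settles every outstanding tag up to and including T. A self-contained model of that erase, independent of the producer class:
#include <cstdint>
#include <iterator>
#include <map>
#include <string>

void ackMultiple(std::map<uint64_t, std::string> & outstanding, uint64_t tag)
{
    auto it = outstanding.find(tag);
    if (it == outstanding.end())
        return; /// theoretically not possible, as the comment above notes
    outstanding.erase(outstanding.begin(), std::next(it)); /// erase [begin, tag] inclusive
}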
@ -308,13 +305,18 @@ void WriteBufferToRabbitMQProducer::writingFunc()
{
while ((!payloads.empty() || wait_all) && wait_confirm.load())
{
/* Publish main payloads only when there are no returned messages. This way it is ensured that returned messages are republished
* as fast as possible and no new publishes are made before returned messages are handled
*/
if (!returned.empty() && producer_channel->usable())
publish(returned, true);
else if (!payloads.empty() && producer_channel->usable())
publish(payloads, false);
/// If the onReady callback has not been received, producer->usable() will still return true,
/// but we must publish only after the onReady callback.
if (producer_ready)
{
/* Publish main payloads only when there are no returned messages. This way it is ensured that returned messages are republished
* as fast as possible and no new publishes are made before returned messages are handled
*/
if (!returned.empty() && producer_channel->usable())
publish(returned, true);
else if (!payloads.empty() && producer_channel->usable())
publish(payloads, false);
}
iterateEventLoop();

View File

@ -7,6 +7,7 @@
#include <atomic>
#include <amqpcpp.h>
#include <Storages/RabbitMQ/RabbitMQHandler.h>
#include <Storages/RabbitMQ/UVLoop.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Core/BackgroundSchedulePool.h>
#include <Core/Names.h>
@ -69,10 +70,11 @@ private:
AMQP::Table key_arguments;
BackgroundSchedulePool::TaskHolder writing_task;
std::unique_ptr<uv_loop_t> loop;
UVLoop loop;
std::unique_ptr<RabbitMQHandler> event_handler;
std::unique_ptr<AMQP::TcpConnection> connection;
std::unique_ptr<AMQP::TcpChannel> producer_channel;
bool producer_ready = false;
/// Channel errors lead to channel closure, need to count number of recreated channels to update channel id
UInt64 channel_id_counter = 0;

View File

@ -79,7 +79,7 @@ void StorageJoin::truncate(
}
HashJoinPtr StorageJoin::getJoin(std::shared_ptr<TableJoin> analyzed_join) const
HashJoinPtr StorageJoin::getJoinLocked(std::shared_ptr<TableJoin> analyzed_join) const
{
auto metadata_snapshot = getInMemoryMetadataPtr();
if (!analyzed_join->sameStrictnessAndKind(strictness, kind))
@ -96,17 +96,47 @@ HashJoinPtr StorageJoin::getJoin(std::shared_ptr<TableJoin> analyzed_join) const
analyzed_join->setRightKeys(key_names);
HashJoinPtr join_clone = std::make_shared<HashJoin>(analyzed_join, metadata_snapshot->getSampleBlock().sortColumns());
join_clone->setLock(rwlock);
join_clone->reuseJoinedData(*join);
return join_clone;
}
void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block, true); }
void StorageJoin::insertBlock(const Block & block)
{
std::unique_lock<std::shared_mutex> lock(rwlock);
join->addJoinedBlock(block, true);
}
size_t StorageJoin::getSize() const { return join->getTotalRowCount(); }
std::optional<UInt64> StorageJoin::totalRows(const Settings &) const { return join->getTotalRowCount(); }
std::optional<UInt64> StorageJoin::totalBytes(const Settings &) const { return join->getTotalByteCount(); }
size_t StorageJoin::getSize() const
{
std::shared_lock<std::shared_mutex> lock(rwlock);
return join->getTotalRowCount();
}
std::optional<UInt64> StorageJoin::totalRows(const Settings &) const
{
std::shared_lock<std::shared_mutex> lock(rwlock);
return join->getTotalRowCount();
}
std::optional<UInt64> StorageJoin::totalBytes(const Settings &) const
{
std::shared_lock<std::shared_mutex> lock(rwlock);
return join->getTotalByteCount();
}
DataTypePtr StorageJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types, const String & column_name, bool or_null) const
{
return join->joinGetCheckAndGetReturnType(data_types, column_name, or_null);
}
ColumnWithTypeAndName StorageJoin::joinGet(const Block & block, const Block & block_with_columns_to_add) const
{
std::shared_lock<std::shared_mutex> lock(rwlock);
return join->joinGet(block, block_with_columns_to_add);
}
void registerStorageJoin(StorageFactory & factory)
{
@ -264,24 +294,24 @@ size_t rawSize(const StringRef & t)
class JoinSource : public SourceWithProgress
{
public:
JoinSource(const HashJoin & parent_, UInt64 max_block_size_, Block sample_block_)
JoinSource(HashJoinPtr join_, std::shared_mutex & rwlock, UInt64 max_block_size_, Block sample_block_)
: SourceWithProgress(sample_block_)
, parent(parent_)
, lock(parent.data->rwlock)
, join(join_)
, lock(rwlock)
, max_block_size(max_block_size_)
, sample_block(std::move(sample_block_))
{
column_indices.resize(sample_block.columns());
auto & saved_block = parent.getJoinedData()->sample_block;
auto & saved_block = join->getJoinedData()->sample_block;
for (size_t i = 0; i < sample_block.columns(); ++i)
{
auto & [_, type, name] = sample_block.getByPosition(i);
if (parent.right_table_keys.has(name))
if (join->right_table_keys.has(name))
{
key_pos = i;
const auto & column = parent.right_table_keys.getByName(name);
const auto & column = join->right_table_keys.getByName(name);
restored_block.insert(column);
}
else
@ -300,19 +330,20 @@ public:
protected:
Chunk generate() override
{
if (parent.data->blocks.empty())
if (join->data->blocks.empty())
return {};
Chunk chunk;
if (!joinDispatch(parent.kind, parent.strictness, parent.data->maps,
if (!joinDispatch(join->kind, join->strictness, join->data->maps,
[&](auto kind, auto strictness, auto & map) { chunk = createChunk<kind, strictness>(map); }))
throw Exception("Logical error: unknown JOIN strictness", ErrorCodes::LOGICAL_ERROR);
return chunk;
}
private:
const HashJoin & parent;
HashJoinPtr join;
std::shared_lock<std::shared_mutex> lock;
UInt64 max_block_size;
Block sample_block;
Block restored_block; /// sample_block with parent column types
@ -330,7 +361,7 @@ private:
size_t rows_added = 0;
switch (parent.data->type)
switch (join->data->type)
{
#define M(TYPE) \
case HashJoin::Type::TYPE: \
@ -340,7 +371,7 @@ private:
#undef M
default:
throw Exception("Unsupported JOIN keys in StorageJoin. Type: " + toString(static_cast<UInt32>(parent.data->type)),
throw Exception("Unsupported JOIN keys in StorageJoin. Type: " + toString(static_cast<UInt32>(join->data->type)),
ErrorCodes::UNSUPPORTED_JOIN_KEYS);
}
@ -468,7 +499,8 @@ Pipe StorageJoin::read(
{
metadata_snapshot->check(column_names, getVirtuals(), getStorageID());
return Pipe(std::make_shared<JoinSource>(*join, max_block_size, metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID())));
Block source_sample_block = metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID());
return Pipe(std::make_shared<JoinSource>(join, rwlock, max_block_size, source_sample_block));
}
}

View File

@ -14,7 +14,6 @@ class TableJoin;
class HashJoin;
using HashJoinPtr = std::shared_ptr<HashJoin>;
/** Allows you to save the state for later use on the right side of the JOIN.
* When inserted into a table, the data will be inserted into the state,
* and also written to the backup file, to restore after the restart.
@ -30,12 +29,17 @@ public:
void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, const Context &, TableExclusiveLockHolder &) override;
/// Access the innards.
HashJoinPtr & getJoin() { return join; }
HashJoinPtr getJoin(std::shared_ptr<TableJoin> analyzed_join) const;
/// Returns an instance of HashJoin holding a lock that protects it from insertions into StorageJoin.
/// HashJoin relies on the structure of the hash table, which is why we need to return it with the mutex locked.
HashJoinPtr getJoinLocked(std::shared_ptr<TableJoin> analyzed_join) const;
/// Verify that the data structure is suitable for implementing this type of JOIN.
void assertCompatible(ASTTableJoin::Kind kind_, ASTTableJoin::Strictness strictness_) const;
/// Get result type for function "joinGet(OrNull)"
DataTypePtr joinGetCheckAndGetReturnType(const DataTypes & data_types, const String & column_name, bool or_null) const;
/// Execute function "joinGet(OrNull)" on data block.
/// Takes rwlock for reading to prevent parallel StorageJoin updates while processing a data block
/// (but not for the whole query; that is safe for joinGet, which doesn't involve `used_flags` from HashJoin)
ColumnWithTypeAndName joinGet(const Block & block, const Block & block_with_columns_to_add) const;
Pipe read(
const Names & column_names,
@ -61,6 +65,10 @@ private:
std::shared_ptr<TableJoin> table_join;
HashJoinPtr join;
/// Protects state for concurrent use in insertFromBlock and joinBlock.
/// The lock is stored in the HashJoin instance for the duration of a query and blocks concurrent insertions.
mutable std::shared_mutex rwlock;
void insertBlock(const Block & block) override;
void finishInsert() override {}
size_t getSize() const override;

View File

@ -320,6 +320,12 @@ void StorageMaterializedView::checkAlterIsPossible(const AlterCommands & command
}
}
void StorageMaterializedView::checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const
{
checkStatementCanBeForwarded();
getTargetTable()->checkMutationIsPossible(commands, settings);
}
Pipe StorageMaterializedView::alterPartition(
const StorageMetadataPtr & metadata_snapshot, const PartitionCommands & commands, const Context & context)
{

View File

@ -52,6 +52,8 @@ public:
void alter(const AlterCommands & params, const Context & context, TableLockHolder & table_lock_holder) override;
void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override;
void checkAlterIsPossible(const AlterCommands & commands, const Settings & settings) const override;
Pipe alterPartition(const StorageMetadataPtr & metadata_snapshot, const PartitionCommands & commands, const Context & context) override;

View File

@ -253,6 +253,11 @@ static inline void updateBlockData(Block & old_block, const Block & new_block)
}
}
void StorageMemory::checkMutationIsPossible(const MutationCommands & /*commands*/, const Settings & /*settings*/) const
{
/// Some validation will be added
}
void StorageMemory::mutate(const MutationCommands & commands, const Context & context)
{
std::lock_guard lock(mutex);

View File

@ -51,6 +51,7 @@ public:
void drop() override;
void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override;
void mutate(const MutationCommands & commands, const Context & context) override;
void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override;

View File

@ -749,8 +749,12 @@ void StorageReplicatedMergeTree::drop()
if (has_metadata_in_zookeeper)
{
/// The table may be shut down with the restarting thread not active,
/// and calling StorageReplicatedMergeTree::getZooKeeper() won't suffice.
auto zookeeper = global_context.getZooKeeper();
/// and calling StorageReplicatedMergeTree::getZooKeeper()/getAuxiliaryZooKeeper() won't suffice.
zkutil::ZooKeeperPtr zookeeper;
if (zookeeper_name == default_zookeeper_name)
zookeeper = global_context.getZooKeeper();
else
zookeeper = global_context.getAuxiliaryZooKeeper(zookeeper_name);
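The branch above is a plain name-based lookup; a hedged Python sketch of the same selection (registry contents are made up):

DEFAULT_ZOOKEEPER_NAME = "default"

# Stand-ins for global_context.getZooKeeper() / getAuxiliaryZooKeeper(name).
auxiliary_zookeepers = {"zookeeper2": "aux-client"}

def get_zookeeper(zookeeper_name):
    if zookeeper_name == DEFAULT_ZOOKEEPER_NAME:
        return "default-client"
    return auxiliary_zookeepers[zookeeper_name]

print(get_zookeeper("zookeeper2"))  # aux-client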
/// If there is probably metadata in ZooKeeper, we don't allow dropping the table.
if (!zookeeper)

View File

@ -105,6 +105,9 @@ void StorageView::read(
static ASTTableExpression * getFirstTableExpression(ASTSelectQuery & select_query)
{
if (!select_query.tables() || select_query.tables()->children.empty())
throw Exception("Logical error: no table expression in view select AST", ErrorCodes::LOGICAL_ERROR);
auto * select_element = select_query.tables()->children[0]->as<ASTTablesInSelectQueryElement>();
if (!select_element->table_expression)

View File

@ -2,12 +2,15 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>
<session_timeout_ms>30000</session_timeout_ms>
<snapshot_distance>0</snapshot_distance>
<reserved_log_items>0</reserved_log_items>
<force_sync>false</force_sync>
<startup_timeout>60000</startup_timeout>
</coordination_settings>
<raft_configuration>

View File

@ -6,7 +6,6 @@ from helpers.cluster import ClickHouseCluster
from helpers.client import QueryRuntimeException
from helpers.test_tools import TSV
cluster = ClickHouseCluster(__file__)
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance("node1", main_configs=["configs/zookeeper_config.xml", "configs/remote_servers.xml"], with_zookeeper=True)
node2 = cluster.add_instance("node2", main_configs=["configs/zookeeper_config.xml", "configs/remote_servers.xml"], with_zookeeper=True)
@ -78,3 +77,27 @@ def test_create_replicated_merge_tree_with_not_exists_auxiliary_zookeeper(starte
ENGINE = ReplicatedMergeTree('zookeeper_not_exits:/clickhouse/tables/test/test_auxiliary_zookeeper', '{replica}')
ORDER BY a;
'''.format(replica=node1.name))
# Drop table with auxiliary zookeeper.
def test_drop_replicated_merge_tree_with_auxiliary_zookeeper(started_cluster):
drop_table([node1, node2], "test_auxiliary_zookeeper")
for node in [node1, node2]:
node.query(
'''
CREATE TABLE test_auxiliary_zookeeper(a Int32)
ENGINE = ReplicatedMergeTree('zookeeper2:/clickhouse/tables/test/test_auxiliary_zookeeper', '{replica}')
ORDER BY a;
'''.format(replica=node.name))
# Insert data into node1, and query it from node2.
node1.query("INSERT INTO test_auxiliary_zookeeper VALUES (1)")
time.sleep(5)
expected = "1\n"
assert TSV(node1.query("SELECT a FROM test_auxiliary_zookeeper")) == TSV(expected)
assert TSV(node2.query("SELECT a FROM test_auxiliary_zookeeper")) == TSV(expected)
zk = cluster.get_kazoo_client('zoo1')
assert zk.exists('/clickhouse/tables/test/test_auxiliary_zookeeper')
drop_table([node1, node2], "test_auxiliary_zookeeper")
assert zk.exists('/clickhouse/tables/test/test_auxiliary_zookeeper') is None

View File

@ -2,11 +2,13 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
<force_sync>false</force_sync>
</coordination_settings>
<raft_configuration>

View File

@ -0,0 +1,8 @@
<yandex>
<zookeeper>
<node index="1">
<host>node1</host>
<port>9181</port>
</node>
</zookeeper>
</yandex>

View File

@ -8,32 +8,23 @@ from multiprocessing.dummy import Pool
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance('node', main_configs=['configs/enable_test_keeper.xml', 'configs/logs_conf.xml'], with_zookeeper=True)
from kazoo.client import KazooClient, KazooState
_genuine_zk_instance = None
_fake_zk_instance = None
from kazoo.client import KazooClient, KazooState, KeeperState
def get_genuine_zk():
global _genuine_zk_instance
if not _genuine_zk_instance:
print("Zoo1", cluster.get_instance_ip("zoo1"))
_genuine_zk_instance = cluster.get_kazoo_client('zoo1')
return _genuine_zk_instance
print("Zoo1", cluster.get_instance_ip("zoo1"))
return cluster.get_kazoo_client('zoo1')
def get_fake_zk():
global _fake_zk_instance
if not _fake_zk_instance:
print("node", cluster.get_instance_ip("node"))
_fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0)
def reset_last_zxid_listener(state):
print("Fake zk callback called for state", state)
global _fake_zk_instance
if state != KazooState.CONNECTED:
_fake_zk_instance._reset()
print("node", cluster.get_instance_ip("node"))
_fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0)
def reset_last_zxid_listener(state):
print("Fake zk callback called for state", state)
nonlocal _fake_zk_instance
if state != KazooState.CONNECTED:
_fake_zk_instance._reset()
_fake_zk_instance.add_listener(reset_last_zxid_listener)
_fake_zk_instance.start()
_fake_zk_instance.add_listener(reset_last_zxid_listener)
_fake_zk_instance.start()
return _fake_zk_instance
def random_string(length):
@ -44,6 +35,15 @@ def create_random_path(prefix="", depth=1):
return prefix
return create_random_path(os.path.join(prefix, random_string(3)), depth - 1)
def stop_zk(zk):
try:
if zk:
zk.stop()
zk.close()
except:
pass
@pytest.fixture(scope="module")
def started_cluster():
try:
@ -53,44 +53,46 @@ def started_cluster():
finally:
cluster.shutdown()
if _genuine_zk_instance:
_genuine_zk_instance.stop()
_genuine_zk_instance.close()
if _fake_zk_instance:
_fake_zk_instance.stop()
_fake_zk_instance.close()
def test_simple_commands(started_cluster):
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
try:
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
for zk in [genuine_zk, fake_zk]:
zk.create("/test_simple_commands", b"")
zk.create("/test_simple_commands/somenode1", b"hello")
zk.set("/test_simple_commands/somenode1", b"world")
for zk in [genuine_zk, fake_zk]:
zk.create("/test_simple_commands", b"")
zk.create("/test_simple_commands/somenode1", b"hello")
zk.set("/test_simple_commands/somenode1", b"world")
for zk in [genuine_zk, fake_zk]:
assert zk.exists("/test_simple_commands")
assert zk.exists("/test_simple_commands/somenode1")
print(zk.get("/test_simple_commands/somenode1"))
assert zk.get("/test_simple_commands/somenode1")[0] == b"world"
for zk in [genuine_zk, fake_zk]:
assert zk.exists("/test_simple_commands")
assert zk.exists("/test_simple_commands/somenode1")
print(zk.get("/test_simple_commands/somenode1"))
assert zk.get("/test_simple_commands/somenode1")[0] == b"world"
finally:
for zk in [genuine_zk, fake_zk]:
stop_zk(zk)
def test_sequential_nodes(started_cluster):
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_sequential_nodes")
fake_zk.create("/test_sequential_nodes")
for i in range(1, 11):
genuine_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True)
genuine_zk.create("/test_sequential_nodes/" + ("b" * i))
fake_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True)
fake_zk.create("/test_sequential_nodes/" + ("b" * i))
try:
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_sequential_nodes")
fake_zk.create("/test_sequential_nodes")
for i in range(1, 11):
genuine_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True)
genuine_zk.create("/test_sequential_nodes/" + ("b" * i))
fake_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True)
fake_zk.create("/test_sequential_nodes/" + ("b" * i))
genuine_childs = list(sorted(genuine_zk.get_children("/test_sequential_nodes")))
fake_childs = list(sorted(fake_zk.get_children("/test_sequential_nodes")))
assert genuine_childs == fake_childs
genuine_childs = list(sorted(genuine_zk.get_children("/test_sequential_nodes")))
fake_childs = list(sorted(fake_zk.get_children("/test_sequential_nodes")))
assert genuine_childs == fake_childs
finally:
for zk in [genuine_zk, fake_zk]:
stop_zk(zk)
def assert_eq_stats(stat1, stat2):
@ -102,130 +104,141 @@ def assert_eq_stats(stat1, stat2):
assert stat1.numChildren == stat2.numChildren
def test_stats(started_cluster):
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_stats_nodes")
fake_zk.create("/test_stats_nodes")
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
assert_eq_stats(genuine_stats, fake_stats)
for i in range(1, 11):
genuine_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True)
genuine_zk.create("/test_stats_nodes/" + ("b" * i))
fake_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True)
fake_zk.create("/test_stats_nodes/" + ("b" * i))
try:
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_stats_nodes")
fake_zk.create("/test_stats_nodes")
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
assert_eq_stats(genuine_stats, fake_stats)
for i in range(1, 11):
genuine_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True)
genuine_zk.create("/test_stats_nodes/" + ("b" * i))
fake_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True)
fake_zk.create("/test_stats_nodes/" + ("b" * i))
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
assert_eq_stats(genuine_stats, fake_stats)
for i in range(1, 11):
print("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2))
genuine_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2))
genuine_zk.delete("/test_stats_nodes/" + ("b" * i))
fake_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2))
fake_zk.delete("/test_stats_nodes/" + ("b" * i))
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
assert_eq_stats(genuine_stats, fake_stats)
for i in range(1, 11):
print("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2))
genuine_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2))
genuine_zk.delete("/test_stats_nodes/" + ("b" * i))
fake_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2))
fake_zk.delete("/test_stats_nodes/" + ("b" * i))
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
print(genuine_stats)
print(fake_stats)
assert_eq_stats(genuine_stats, fake_stats)
for i in range(100):
genuine_zk.set("/test_stats_nodes", ("q" * i).encode())
fake_zk.set("/test_stats_nodes", ("q" * i).encode())
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
print(genuine_stats)
print(fake_stats)
assert_eq_stats(genuine_stats, fake_stats)
for i in range(100):
genuine_zk.set("/test_stats_nodes", ("q" * i).encode())
fake_zk.set("/test_stats_nodes", ("q" * i).encode())
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
print(genuine_stats)
print(fake_stats)
assert_eq_stats(genuine_stats, fake_stats)
genuine_stats = genuine_zk.exists("/test_stats_nodes")
fake_stats = fake_zk.exists("/test_stats_nodes")
print(genuine_stats)
print(fake_stats)
assert_eq_stats(genuine_stats, fake_stats)
finally:
for zk in [genuine_zk, fake_zk]:
stop_zk(zk)
def test_watchers(started_cluster):
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_data_watches")
fake_zk.create("/test_data_watches")
genuine_data_watch_data = None
try:
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_data_watches")
fake_zk.create("/test_data_watches")
genuine_data_watch_data = None
def genuine_callback(event):
print("Genuine data watch called")
nonlocal genuine_data_watch_data
genuine_data_watch_data = event
def genuine_callback(event):
print("Genuine data watch called")
nonlocal genuine_data_watch_data
genuine_data_watch_data = event
fake_data_watch_data = None
def fake_callback(event):
print("Fake data watch called")
nonlocal fake_data_watch_data
fake_data_watch_data = event
fake_data_watch_data = None
def fake_callback(event):
print("Fake data watch called")
nonlocal fake_data_watch_data
fake_data_watch_data = event
genuine_zk.get("/test_data_watches", watch=genuine_callback)
fake_zk.get("/test_data_watches", watch=fake_callback)
genuine_zk.get("/test_data_watches", watch=genuine_callback)
fake_zk.get("/test_data_watches", watch=fake_callback)
print("Calling set genuine")
genuine_zk.set("/test_data_watches", b"a")
print("Calling set fake")
fake_zk.set("/test_data_watches", b"a")
time.sleep(3)
print("Calling set genuine")
genuine_zk.set("/test_data_watches", b"a")
print("Calling set fake")
fake_zk.set("/test_data_watches", b"a")
time.sleep(3)
print("Genuine data", genuine_data_watch_data)
print("Fake data", fake_data_watch_data)
assert genuine_data_watch_data == fake_data_watch_data
print("Genuine data", genuine_data_watch_data)
print("Fake data", fake_data_watch_data)
assert genuine_data_watch_data == fake_data_watch_data
genuine_children = None
def genuine_child_callback(event):
print("Genuine child watch called")
nonlocal genuine_children
genuine_children = event
genuine_children = None
def genuine_child_callback(event):
print("Genuine child watch called")
nonlocal genuine_children
genuine_children = event
fake_children = None
def fake_child_callback(event):
print("Fake child watch called")
nonlocal fake_children
fake_children = event
fake_children = None
def fake_child_callback(event):
print("Fake child watch called")
nonlocal fake_children
fake_children = event
genuine_zk.get_children("/test_data_watches", watch=genuine_child_callback)
fake_zk.get_children("/test_data_watches", watch=fake_child_callback)
genuine_zk.get_children("/test_data_watches", watch=genuine_child_callback)
fake_zk.get_children("/test_data_watches", watch=fake_child_callback)
print("Calling genuine child")
genuine_zk.create("/test_data_watches/child", b"b")
print("Calling fake child")
fake_zk.create("/test_data_watches/child", b"b")
print("Calling genuine child")
genuine_zk.create("/test_data_watches/child", b"b")
print("Calling fake child")
fake_zk.create("/test_data_watches/child", b"b")
time.sleep(3)
time.sleep(3)
print("Genuine children", genuine_children)
print("Fake children", fake_children)
assert genuine_children == fake_children
print("Genuine children", genuine_children)
print("Fake children", fake_children)
assert genuine_children == fake_children
finally:
for zk in [genuine_zk, fake_zk]:
stop_zk(zk)
def test_multitransactions(started_cluster):
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
for zk in [genuine_zk, fake_zk]:
zk.create('/test_multitransactions')
t = zk.transaction()
t.create('/test_multitransactions/freddy')
t.create('/test_multitransactions/fred', ephemeral=True)
t.create('/test_multitransactions/smith', sequence=True)
results = t.commit()
assert len(results) == 3
assert results[0] == '/test_multitransactions/freddy'
assert results[2].startswith('/test_multitransactions/smith0') is True
from kazoo.exceptions import RolledBackError, NoNodeError
for i, zk in enumerate([genuine_zk, fake_zk]):
print("Processing ZK", i)
t = zk.transaction()
t.create('/test_multitransactions/q')
t.delete('/test_multitransactions/a')
t.create('/test_multitransactions/x')
results = t.commit()
print("Results", results)
assert results[0].__class__ == RolledBackError
assert results[1].__class__ == NoNodeError
assert zk.exists('/test_multitransactions/q') is None
assert zk.exists('/test_multitransactions/a') is None
assert zk.exists('/test_multitransactions/x') is None
try:
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
for zk in [genuine_zk, fake_zk]:
zk.create('/test_multitransactions')
t = zk.transaction()
t.create('/test_multitransactions/freddy')
t.create('/test_multitransactions/fred', ephemeral=True)
t.create('/test_multitransactions/smith', sequence=True)
results = t.commit()
assert len(results) == 3
assert results[0] == '/test_multitransactions/freddy'
assert results[2].startswith('/test_multitransactions/smith0') is True
from kazoo.exceptions import RolledBackError, NoNodeError
for i, zk in enumerate([genuine_zk, fake_zk]):
print("Processing ZK", i)
t = zk.transaction()
t.create('/test_multitransactions/q')
t.delete('/test_multitransactions/a')
t.create('/test_multitransactions/x')
results = t.commit()
print("Results", results)
assert results[0].__class__ == RolledBackError
assert results[1].__class__ == NoNodeError
assert zk.exists('/test_multitransactions/q') is None
assert zk.exists('/test_multitransactions/a') is None
assert zk.exists('/test_multitransactions/x') is None
finally:
for zk in [genuine_zk, fake_zk]:
stop_zk(zk)
def exists(zk, path):
result = zk.exists(path)
@ -278,13 +291,13 @@ class Request(object):
arg_str = ', '.join([str(k) + "=" + str(v) for k, v in self.arguments.items()])
return "ZKRequest name {} with arguments {}".format(self.name, arg_str)
def generate_requests(iters=1):
def generate_requests(prefix="/", iters=1):
requests = []
existing_paths = []
for i in range(iters):
for _ in range(100):
rand_length = random.randint(0, 10)
path = "/"
path = prefix
for j in range(1, rand_length):
path = create_random_path(path, 1)
existing_paths.append(path)
@ -322,31 +335,43 @@ def generate_requests(iters=1):
def test_random_requests(started_cluster):
requests = generate_requests(10)
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
for i, request in enumerate(requests):
genuine_throw = False
fake_throw = False
fake_result = None
genuine_result = None
try:
genuine_result = request.callback(genuine_zk)
except Exception as ex:
genuine_throw = True
try:
requests = generate_requests("/test_random_requests", 10)
print("Generated", len(requests), "requests")
genuine_zk = get_genuine_zk()
fake_zk = get_fake_zk()
genuine_zk.create("/test_random_requests")
fake_zk.create("/test_random_requests")
for i, request in enumerate(requests):
genuine_throw = False
fake_throw = False
fake_result = None
genuine_result = None
try:
genuine_result = request.callback(genuine_zk)
except Exception as ex:
print("i", i, "request", request)
print("Genuine exception", str(ex))
genuine_throw = True
try:
fake_result = request.callback(fake_zk)
except Exception as ex:
fake_throw = True
try:
fake_result = request.callback(fake_zk)
except Exception as ex:
print("i", i, "request", request)
print("Fake exception", str(ex))
fake_throw = True
assert fake_throw == genuine_throw, "Fake throw when genuine did not, or vice versa"
assert fake_result == genuine_result, "Zookeeper results differ"
root_children_genuine = [elem for elem in list(sorted(genuine_zk.get_children("/"))) if elem not in ('clickhouse', 'zookeeper')]
root_children_fake = [elem for elem in list(sorted(fake_zk.get_children("/"))) if elem not in ('clickhouse', 'zookeeper')]
assert root_children_fake == root_children_genuine
assert fake_throw == genuine_throw, "Fake throw when genuine did not, or vice versa, request {}".format(request)
assert fake_result == genuine_result, "Zookeeper results differ"
root_children_genuine = [elem for elem in list(sorted(genuine_zk.get_children("/test_random_requests"))) if elem not in ('clickhouse', 'zookeeper')]
root_children_fake = [elem for elem in list(sorted(fake_zk.get_children("/test_random_requests"))) if elem not in ('clickhouse', 'zookeeper')]
assert root_children_fake == root_children_genuine
finally:
for zk in [genuine_zk, fake_zk]:
stop_zk(zk)
def test_end_of_session(started_cluster):
fake_zk1 = None
fake_zk2 = None
genuine_zk1 = None
@ -401,13 +426,8 @@ def test_end_of_session(started_cluster):
assert fake_ephemeral_event == genuine_ephemeral_event
finally:
try:
for zk in [fake_zk1, fake_zk2, genuine_zk1, genuine_zk2]:
if zk:
zk.stop()
zk.close()
except:
pass
for zk in [fake_zk1, fake_zk2, genuine_zk1, genuine_zk2]:
stop_zk(zk)
def test_end_of_watches_session(started_cluster):
fake_zk1 = None
@ -442,91 +462,89 @@ def test_end_of_watches_session(started_cluster):
assert dummy_set == 2
finally:
try:
for zk in [fake_zk1, fake_zk2]:
if zk:
zk.stop()
zk.close()
except:
pass
for zk in [fake_zk1, fake_zk2]:
stop_zk(zk)
def test_concurrent_watches(started_cluster):
fake_zk = get_fake_zk()
fake_zk.restart()
global_path = "/test_concurrent_watches_0"
fake_zk.create(global_path)
try:
fake_zk = get_fake_zk()
fake_zk.restart()
global_path = "/test_concurrent_watches_0"
fake_zk.create(global_path)
dumb_watch_triggered_counter = 0
all_paths_triggered = []
dumb_watch_triggered_counter = 0
all_paths_triggered = []
existing_path = []
all_paths_created = []
watches_created = 0
def create_path_and_watch(i):
nonlocal watches_created
nonlocal all_paths_created
fake_zk.ensure_path(global_path + "/" + str(i))
# new function each time
def dumb_watch(event):
nonlocal dumb_watch_triggered_counter
dumb_watch_triggered_counter += 1
nonlocal all_paths_triggered
all_paths_triggered.append(event.path)
existing_path = []
all_paths_created = []
watches_created = 0
def create_path_and_watch(i):
nonlocal watches_created
nonlocal all_paths_created
fake_zk.ensure_path(global_path + "/" + str(i))
# new function each time
def dumb_watch(event):
nonlocal dumb_watch_triggered_counter
dumb_watch_triggered_counter += 1
nonlocal all_paths_triggered
all_paths_triggered.append(event.path)
fake_zk.get(global_path + "/" + str(i), watch=dumb_watch)
all_paths_created.append(global_path + "/" + str(i))
watches_created += 1
existing_path.append(i)
fake_zk.get(global_path + "/" + str(i), watch=dumb_watch)
all_paths_created.append(global_path + "/" + str(i))
watches_created += 1
existing_path.append(i)
trigger_called = 0
def trigger_watch(i):
nonlocal trigger_called
trigger_called += 1
fake_zk.set(global_path + "/" + str(i), b"somevalue")
try:
existing_path.remove(i)
except:
pass
def call(total):
for i in range(total):
create_path_and_watch(random.randint(0, 1000))
time.sleep(random.random() % 0.5)
trigger_called = 0
def trigger_watch(i):
nonlocal trigger_called
trigger_called += 1
fake_zk.set(global_path + "/" + str(i), b"somevalue")
try:
rand_num = random.choice(existing_path)
trigger_watch(rand_num)
except:
pass
while existing_path:
try:
rand_num = random.choice(existing_path)
trigger_watch(rand_num)
existing_path.remove(i)
except:
pass
p = Pool(10)
arguments = [100] * 10
watches_must_be_created = sum(arguments)
watches_trigger_must_be_called = sum(arguments)
watches_must_be_triggered = sum(arguments)
p.map(call, arguments)
p.close()
def call(total):
for i in range(total):
create_path_and_watch(random.randint(0, 1000))
time.sleep(random.random() % 0.5)
try:
rand_num = random.choice(existing_path)
trigger_watch(rand_num)
except:
pass
while existing_path:
try:
rand_num = random.choice(existing_path)
trigger_watch(rand_num)
except:
pass
# waiting for late watches
for i in range(50):
if dumb_watch_triggered_counter == watches_must_be_triggered:
break
p = Pool(10)
arguments = [100] * 10
watches_must_be_created = sum(arguments)
watches_trigger_must_be_called = sum(arguments)
watches_must_be_triggered = sum(arguments)
p.map(call, arguments)
p.close()
time.sleep(0.1)
# waiting for late watches
for i in range(50):
if dumb_watch_triggered_counter == watches_must_be_triggered:
break
assert watches_created == watches_must_be_created
assert trigger_called >= watches_trigger_must_be_called
assert len(existing_path) == 0
if dumb_watch_triggered_counter != watches_must_be_triggered:
print("All created paths", all_paths_created)
print("All triggerred paths", all_paths_triggered)
print("All paths len", len(all_paths_created))
print("All triggered len", len(all_paths_triggered))
print("Diff", list(set(all_paths_created) - set(all_paths_triggered)))
time.sleep(0.1)
assert dumb_watch_triggered_counter == watches_must_be_triggered
assert watches_created == watches_must_be_created
assert trigger_called >= watches_trigger_must_be_called
assert len(existing_path) == 0
if dumb_watch_triggered_counter != watches_must_be_triggered:
print("All created paths", all_paths_created)
print("All triggerred paths", all_paths_triggered)
print("All paths len", len(all_paths_created))
print("All triggered len", len(all_paths_triggered))
print("Diff", list(set(all_paths_created) - set(all_paths_triggered)))
assert dumb_watch_triggered_counter == watches_must_be_triggered
finally:
stop_zk(fake_zk)

View File

@ -2,6 +2,7 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>

View File

@ -2,6 +2,7 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>2</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>

View File

@ -2,6 +2,7 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>3</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>

View File

@ -6,6 +6,7 @@ import os
import time
from multiprocessing.dummy import Pool
from helpers.network import PartitionManager
from helpers.test_tools import assert_eq_with_retry
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True)
@ -14,6 +15,18 @@ node3 = cluster.add_instance('node3', main_configs=['configs/enable_test_keeper3
from kazoo.client import KazooClient, KazooState
"""
In this test, we blockade the RAFT leader and check that the whole system is
able to recover. It's not a good test because we use ClickHouse's replicated
tables to check connectivity, but they may require special operations (or a long
wait) after session expiration. We don't use kazoo for the check, because that client
pretends to be very smart: it SUSPENDs sessions, tries to recover them, and so on; the
test would be even less predictable than with ClickHouse tables.
TODO find (or write) a less smart python client.
TODO remove this when jepsen tests are written.
"""
@pytest.fixture(scope="module")
def started_cluster():
try:
@ -55,7 +68,6 @@ def get_fake_zk(nodename, timeout=30.0):
_fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout)
def reset_listener(state):
nonlocal _fake_zk_instance
print("Fake zk callback called for state", state)
if state != KazooState.CONNECTED:
_fake_zk_instance._reset()
@ -67,19 +79,25 @@ def get_fake_zk(nodename, timeout=30.0):
# in extremely rare cases it can take more than 5 minutes in a debug build with sanitizers
@pytest.mark.timeout(600)
def test_blocade_leader(started_cluster):
wait_nodes()
for i, node in enumerate([node1, node2, node3]):
node.query("CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary")
node.query("CREATE TABLE ordinary.t1 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t1', '{}') ORDER BY tuple()".format(i + 1))
for i in range(100):
wait_nodes()
try:
for i, node in enumerate([node1, node2, node3]):
node.query("CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary")
node.query("CREATE TABLE IF NOT EXISTS ordinary.t1 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t1', '{}') ORDER BY tuple()".format(i + 1))
break
except Exception as ex:
print("Got exception from node", smaller_exception(ex))
time.sleep(0.1)
node2.query("INSERT INTO ordinary.t1 SELECT number FROM numbers(10)")
node1.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10)
node3.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10)
assert node1.query("SELECT COUNT() FROM ordinary.t1") == "10\n"
assert node2.query("SELECT COUNT() FROM ordinary.t1") == "10\n"
assert node3.query("SELECT COUNT() FROM ordinary.t1") == "10\n"
assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t1", "10")
assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t1", "10")
assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t1", "10")
with PartitionManager() as pm:
pm.partition_instances(node2, node1)
@ -87,7 +105,7 @@ def test_blocade_leader(started_cluster):
for i in range(100):
try:
node2.query("SYSTEM RESTART REPLICA ordinary.t1")
restart_replica_for_sure(node2, "ordinary.t1", "/clickhouse/t1/replicas/2")
node2.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)")
break
except Exception as ex:
@ -104,7 +122,7 @@ def test_blocade_leader(started_cluster):
for i in range(100):
try:
node3.query("SYSTEM RESTART REPLICA ordinary.t1")
restart_replica_for_sure(node3, "ordinary.t1", "/clickhouse/t1/replicas/3")
node3.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)")
break
except Exception as ex:
@ -122,7 +140,7 @@ def test_blocade_leader(started_cluster):
for n, node in enumerate([node1, node2, node3]):
for i in range(100):
try:
node.query("SYSTEM RESTART REPLICA ordinary.t1")
restart_replica_for_sure(node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1))
break
except Exception as ex:
try:
@ -150,7 +168,7 @@ def test_blocade_leader(started_cluster):
for n, node in enumerate([node1, node2, node3]):
for i in range(100):
try:
node.query("SYSTEM RESTART REPLICA ordinary.t1")
restart_replica_for_sure(node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1))
node.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10)
break
except Exception as ex:
@ -170,9 +188,9 @@ def test_blocade_leader(started_cluster):
for num, node in enumerate([node1, node2, node3]):
dump_zk(node, '/clickhouse/t1', '/clickhouse/t1/replicas/{}'.format(num + 1))
assert node1.query("SELECT COUNT() FROM ordinary.t1") == "310\n"
assert node2.query("SELECT COUNT() FROM ordinary.t1") == "310\n"
assert node3.query("SELECT COUNT() FROM ordinary.t1") == "310\n"
assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t1", "310")
assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t1", "310")
assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t1", "310")
def dump_zk(node, zk_path, replica_path):
@ -188,22 +206,47 @@ def dump_zk(node, zk_path, replica_path):
print("Parts")
print(node.query("SELECT name FROM system.zookeeper WHERE path = '{}/parts' FORMAT Vertical".format(replica_path)))
def restart_replica_for_sure(node, table_name, zk_replica_path):
fake_zk = None
try:
node.query("DETACH TABLE {}".format(table_name))
fake_zk = get_fake_zk(node.name)
if fake_zk.exists(zk_replica_path + "/is_active") is not None:
fake_zk.delete(zk_replica_path + "/is_active")
node.query("ATTACH TABLE {}".format(table_name))
except Exception as ex:
print("Exception", ex)
raise ex
finally:
if fake_zk:
fake_zk.stop()
fake_zk.close()
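A usage sketch, mirroring the calls that replace plain SYSTEM RESTART REPLICA below: detach the table, clear its stale is_active znode, and attach it again.

restart_replica_for_sure(node2, "ordinary.t2", "/clickhouse/t2/replicas/2")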
# in extremely rare cases it can take more than 5 minutes in a debug build with sanitizers
@pytest.mark.timeout(600)
def test_blocade_leader_twice(started_cluster):
wait_nodes()
for i, node in enumerate([node1, node2, node3]):
node.query("CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary")
node.query("CREATE TABLE ordinary.t2 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t2', '{}') ORDER BY tuple()".format(i + 1))
for i in range(100):
wait_nodes()
try:
for i, node in enumerate([node1, node2, node3]):
node.query("CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary")
node.query("CREATE TABLE IF NOT EXISTS ordinary.t2 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t2', '{}') ORDER BY tuple()".format(i + 1))
break
except Exception as ex:
print("Got exception from node", smaller_exception(ex))
time.sleep(0.1)
node2.query("INSERT INTO ordinary.t2 SELECT number FROM numbers(10)")
node1.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
assert node1.query("SELECT COUNT() FROM ordinary.t2") == "10\n"
assert node2.query("SELECT COUNT() FROM ordinary.t2") == "10\n"
assert node3.query("SELECT COUNT() FROM ordinary.t2") == "10\n"
assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t2", "10")
assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t2", "10")
assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t2", "10")
with PartitionManager() as pm:
pm.partition_instances(node2, node1)
@ -211,7 +254,7 @@ def test_blocade_leader_twice(started_cluster):
for i in range(100):
try:
node2.query("SYSTEM RESTART REPLICA ordinary.t2")
restart_replica_for_sure(node2, "ordinary.t2", "/clickhouse/t2/replicas/2")
node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
break
except Exception as ex:
@ -228,7 +271,8 @@ def test_blocade_leader_twice(started_cluster):
for i in range(100):
try:
node3.query("SYSTEM RESTART REPLICA ordinary.t2")
restart_replica_for_sure(node3, "ordinary.t2", "/clickhouse/t2/replicas/3")
node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)")
break
except Exception as ex:
@ -243,6 +287,10 @@ def test_blocade_leader_twice(started_cluster):
dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1))
assert False, "Cannot reconnect for node3"
node2.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t2", "210")
assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t2", "210")
# Total network partition
pm.partition_instances(node3, node2)
@ -261,11 +309,10 @@ def test_blocade_leader_twice(started_cluster):
except Exception as ex:
time.sleep(0.5)
for n, node in enumerate([node1, node2, node3]):
for i in range(100):
try:
node.query("SYSTEM RESTART REPLICA ordinary.t2")
restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1))
break
except Exception as ex:
try:
@ -293,29 +340,34 @@ def test_blocade_leader_twice(started_cluster):
dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1))
assert False, "Cannot reconnect for node{}".format(n + 1)
for n, node in enumerate([node1, node2, node3]):
for i in range(100):
try:
node.query("SYSTEM RESTART REPLICA ordinary.t2")
node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
break
except Exception as ex:
all_done = True
for n, node in enumerate([node1, node2, node3]):
try:
node.query("ATTACH TABLE ordinary.t2")
except Exception as attach_ex:
print("Got exception node{}".format(n + 1), smaller_exception(attach_ex))
restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1))
node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10)
break
except Exception as ex:
all_done = False
try:
node.query("ATTACH TABLE ordinary.t2")
except Exception as attach_ex:
print("Got exception node{}".format(n + 1), smaller_exception(attach_ex))
print("Got exception node{}".format(n + 1), smaller_exception(ex))
time.sleep(0.5)
print("Got exception node{}".format(n + 1), smaller_exception(ex))
time.sleep(0.5)
if all_done:
break
else:
for num, node in enumerate([node1, node2, node3]):
dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1))
assert False, "Cannot reconnect for node{}".format(n + 1)
assert False, "Cannot reconnect in i {} retries".format(i)
assert node1.query("SELECT COUNT() FROM ordinary.t2") == "510\n"
assert_eq_with_retry(node1, "SELECT COUNT() FROM ordinary.t2", "510")
if node2.query("SELECT COUNT() FROM ordinary.t2") != "510\n":
for num, node in enumerate([node1, node2, node3]):
dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1))
assert node2.query("SELECT COUNT() FROM ordinary.t2") == "510\n"
assert node3.query("SELECT COUNT() FROM ordinary.t2") == "510\n"
assert_eq_with_retry(node2, "SELECT COUNT() FROM ordinary.t2", "510")
assert_eq_with_retry(node3, "SELECT COUNT() FROM ordinary.t2", "510")

View File

@ -2,6 +2,7 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>

View File

@ -2,6 +2,7 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>2</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>

View File

@ -2,6 +2,7 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>3</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>

View File

@ -6,6 +6,7 @@ import os
import time
from multiprocessing.dummy import Pool
from helpers.network import PartitionManager
from helpers.test_tools import assert_eq_with_retry
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True)
@ -234,6 +235,6 @@ def test_simple_replicated_table(started_cluster):
node1.query("SYSTEM SYNC REPLICA t", timeout=10)
node3.query("SYSTEM SYNC REPLICA t", timeout=10)
assert node1.query("SELECT COUNT() FROM t") == "10\n"
assert node2.query("SELECT COUNT() FROM t") == "10\n"
assert node3.query("SELECT COUNT() FROM t") == "10\n"
assert_eq_with_retry(node1, "SELECT COUNT() FROM t", "10")
assert_eq_with_retry(node2, "SELECT COUNT() FROM t", "10")
assert_eq_with_retry(node3, "SELECT COUNT() FROM t", "10")

View File

@ -0,0 +1 @@
#!/usr/bin/env python3

View File

@ -0,0 +1,21 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>localhost</hostname>
<port>44444</port>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,12 @@
<yandex>
<shutdown_wait_unfinished>3</shutdown_wait_unfinished>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/log.log</log>
<errorlog>/var/log/clickhouse-server/log.err.log</errorlog>
<size>1000M</size>
<count>10</count>
<stderr>/var/log/clickhouse-server/stderr.log</stderr>
<stdout>/var/log/clickhouse-server/stdout.log</stdout>
</logger>
</yandex>

View File

@ -0,0 +1,8 @@
<yandex>
<zookeeper>
<node index="1">
<host>node</host>
<port>9181</port>
</node>
</zookeeper>
</yandex>

View File

@ -0,0 +1,124 @@
#!/usr/bin/env python3
import pytest
from helpers.cluster import ClickHouseCluster
import random
import string
import os
import time
from kazoo.client import KazooClient, KazooState
cluster = ClickHouseCluster(__file__)
node = cluster.add_instance('node', main_configs=['configs/enable_test_keeper.xml', 'configs/logs_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True)
def random_string(length):
return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length))
def create_random_path(prefix="", depth=1):
if depth == 0:
return prefix
return create_random_path(os.path.join(prefix, random_string(3)), depth - 1)
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def get_connection_zk(nodename, timeout=30.0):
_fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout)
def reset_listener(state):
nonlocal _fake_zk_instance
print("Fake zk callback called for state", state)
if state != KazooState.CONNECTED:
_fake_zk_instance._reset()
_fake_zk_instance.add_listener(reset_listener)
_fake_zk_instance.start()
return _fake_zk_instance
def test_state_after_restart(started_cluster):
try:
node_zk = None
node_zk2 = None
node_zk = get_connection_zk("node")
node_zk.create("/test_state_after_restart", b"somevalue")
strs = []
for i in range(100):
strs.append(random_string(123).encode())
node_zk.create("/test_state_after_restart/node" + str(i), strs[i])
for i in range(100):
if i % 7 == 0:
node_zk.delete("/test_state_after_restart/node" + str(i))
node.restart_clickhouse(kill=True)
node_zk2 = get_connection_zk("node")
assert node_zk2.get("/test_state_after_restart")[0] == b"somevalue"
for i in range(100):
if i % 7 == 0:
assert node_zk2.exists("/test_state_after_restart/node" + str(i)) is None
else:
assert len(node_zk2.get("/test_state_after_restart/node" + str(i))[0]) == 123
assert node_zk2.get("/test_state_after_restart/node" + str(i))[0] == strs[i]
finally:
try:
if node_zk is not None:
node_zk.stop()
node_zk.close()
if node_zk2 is not None:
node_zk2.stop()
node_zk2.close()
except:
pass
# http://zookeeper-user.578899.n2.nabble.com/Why-are-ephemeral-nodes-written-to-disk-tp7583403p7583418.html
def test_ephemeral_after_restart(started_cluster):
try:
node_zk = None
node_zk2 = None
node_zk = get_connection_zk("node")
node_zk.create("/test_ephemeral_after_restart", b"somevalue")
strs = []
for i in range(100):
strs.append(random_string(123).encode())
node_zk.create("/test_ephemeral_after_restart/node" + str(i), strs[i], ephemeral=True)
for i in range(100):
if i % 7 == 0:
node_zk.delete("/test_ephemeral_after_restart/node" + str(i))
node.restart_clickhouse(kill=True)
node_zk2 = get_connection_zk("node")
assert node_zk2.get("/test_ephemeral_after_restart")[0] == b"somevalue"
for i in range(100):
if i % 7 == 0:
assert node_zk2.exists("/test_ephemeral_after_restart/node" + str(i)) is None
else:
assert len(node_zk2.get("/test_ephemeral_after_restart/node" + str(i))[0]) == 123
assert node_zk2.get("/test_ephemeral_after_restart/node" + str(i))[0] == strs[i]
finally:
try:
if node_zk is not None:
node_zk.stop()
node_zk.close()
if node_zk2 is not None:
node_zk2.stop()
node_zk2.close()
except:
pass

View File

@ -0,0 +1 @@
#!/usr/bin/env python3

View File

@ -0,0 +1,39 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<priority>3</priority>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>2</priority>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>1</priority>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,39 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>2</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<priority>3</priority>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>2</priority>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>1</priority>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,39 @@
<yandex>
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>3</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>5000</operation_timeout_ms>
<session_timeout_ms>10000</session_timeout_ms>
<raft_logs_level>trace</raft_logs_level>
</coordination_settings>
<raft_configuration>
<server>
<id>1</id>
<hostname>node1</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<priority>3</priority>
</server>
<server>
<id>2</id>
<hostname>node2</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>2</priority>
</server>
<server>
<id>3</id>
<hostname>node3</hostname>
<port>44444</port>
<can_become_leader>true</can_become_leader>
<start_as_follower>true</start_as_follower>
<priority>1</priority>
</server>
</raft_configuration>
</test_keeper_server>
</yandex>

View File

@ -0,0 +1,12 @@
<yandex>
<shutdown_wait_unfinished>3</shutdown_wait_unfinished>
<logger>
<level>trace</level>
<log>/var/log/clickhouse-server/log.log</log>
<errorlog>/var/log/clickhouse-server/log.err.log</errorlog>
<size>1000M</size>
<count>10</count>
<stderr>/var/log/clickhouse-server/stderr.log</stderr>
<stdout>/var/log/clickhouse-server/stdout.log</stdout>
</logger>
</yandex>

View File

@ -0,0 +1,16 @@
<yandex>
<zookeeper>
<node index="1">
<host>node1</host>
<port>9181</port>
</node>
<node index="2">
<host>node2</host>
<port>9181</port>
</node>
<node index="3">
<host>node3</host>
<port>9181</port>
</node>
</zookeeper>
</yandex>

View File

@ -0,0 +1,98 @@
#!/usr/bin/env python3
import pytest
from helpers.cluster import ClickHouseCluster
import random
import string
import os
import time
cluster = ClickHouseCluster(__file__)
node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True)
node2 = cluster.add_instance('node2', main_configs=['configs/enable_test_keeper2.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True)
node3 = cluster.add_instance('node3', main_configs=['configs/enable_test_keeper3.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True)
from kazoo.client import KazooClient, KazooState
@pytest.fixture(scope="module")
def started_cluster():
try:
cluster.start()
yield cluster
finally:
cluster.shutdown()
def get_fake_zk(nodename, timeout=30.0):
_fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout)
def reset_listener(state):
nonlocal _fake_zk_instance
print("Fake zk callback called for state", state)
if state != KazooState.CONNECTED:
_fake_zk_instance._reset()
_fake_zk_instance.add_listener(reset_listener)
_fake_zk_instance.start()
return _fake_zk_instance
def stop_zk(zk):
try:
if zk:
zk.stop()
zk.close()
except:
pass
def test_restart_multinode(started_cluster):
try:
node1_zk = node2_zk = node3_zk = None
node1_zk = get_fake_zk("node1")
node2_zk = get_fake_zk("node2")
node3_zk = get_fake_zk("node3")
for i in range(100):
node1_zk.create("/test_read_write_multinode_node" + str(i), ("somedata" + str(i)).encode())
for i in range(100):
if i % 10 == 0:
node1_zk.delete("/test_read_write_multinode_node" + str(i))
node2_zk.sync("/test_read_write_multinode_node0")
node3_zk.sync("/test_read_write_multinode_node0")
for i in range(100):
if i % 10 != 0:
assert node2_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode()
assert node3_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode()
else:
assert node2_zk.exists("/test_read_write_multinode_node" + str(i)) is None
assert node3_zk.exists("/test_read_write_multinode_node" + str(i)) is None
finally:
for zk in [node1_zk, node2_zk, node3_zk]:
stop_zk(zk)
node1.restart_clickhouse(kill=True)
node2.restart_clickhouse(kill=True)
node3.restart_clickhouse(kill=True)
for i in range(100):
try:
node1_zk = get_fake_zk("node1")
node2_zk = get_fake_zk("node2")
node3_zk = get_fake_zk("node3")
for i in range(100):
if i % 10 != 0:
assert node1_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode()
assert node2_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode()
assert node3_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode()
else:
assert node1_zk.exists("/test_read_write_multinode_node" + str(i)) is None
assert node2_zk.exists("/test_read_write_multinode_node" + str(i)) is None
assert node3_zk.exists("/test_read_write_multinode_node" + str(i)) is None
break
except Exception as ex:
print("Got exception as ex", ex)
finally:
for zk in [node1_zk, node2_zk, node3_zk]:
stop_zk(zk)

View File

@ -1,7 +1,5 @@
#!/usr/bin/env bash
# https://github.com/ClickHouse/ClickHouse/issues/6497
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
SCHEMADIR=$CURDIR/format_schemas
# shellcheck source=../shell_config.sh

View File

@ -0,0 +1,52 @@
[[(1),(2)],[(3),(4),(5)]]
[[(6)]]
[[]]
[]
Binary representation:
00000000 18 0a 08 12 02 18 01 12 02 18 02 0a 0c 12 02 18 |................|
00000010 03 12 02 18 04 12 02 18 05 06 0a 04 12 02 18 06 |................|
00000020 02 0a 00 00 |....|
00000024
MESSAGE #1 AT 0x00000001
x {
y {
z: 1
}
y {
z: 2
}
}
x {
y {
z: 3
}
y {
z: 4
}
y {
z: 5
}
}
MESSAGE #2 AT 0x0000001A
x {
y {
z: 6
}
}
MESSAGE #3 AT 0x00000021
x {
}
MESSAGE #4 AT 0x00000024
Binary representation is as expected
[[(1),(2)],[(3),(4),(5)]]
[[(6)]]
[[]]
[]
[[(1),(2)],[(3),(4),(5)]]
[[(6)]]
[[]]
[]

View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
# https://github.com/ClickHouse/ClickHouse/issues/11117
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
SCHEMADIR=$CURDIR/format_schemas
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
set -eo pipefail
# Run the client.
$CLICKHOUSE_CLIENT --multiquery <<EOF
DROP TABLE IF EXISTS nested_in_nested_protobuf_00825;
CREATE TABLE nested_in_nested_protobuf_00825 (x Nested (y Nested (z Int64))) ENGINE = MergeTree ORDER BY tuple();
INSERT INTO nested_in_nested_protobuf_00825 VALUES ([[(1),(2)],[(3),(4),(5)]]), ([[(6)]]), ([[]]), ([]);
SELECT * FROM nested_in_nested_protobuf_00825;
EOF
BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_nested_in_nested.XXXXXX.binary")
$CLICKHOUSE_CLIENT --query "SELECT * FROM nested_in_nested_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType'" > "$BINARY_FILE_PATH"
# Check the output in the protobuf format
echo
$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType" --input "$BINARY_FILE_PATH"
# Check the input in the protobuf format (now the table contains the same data twice).
echo
$CLICKHOUSE_CLIENT --query "INSERT INTO nested_in_nested_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$SCHEMADIR/00825_protobuf_format_nested_in_nested:MessageType'" < "$BINARY_FILE_PATH"
$CLICKHOUSE_CLIENT --query "SELECT * FROM nested_in_nested_protobuf_00825"
rm "$BINARY_FILE_PATH"
$CLICKHOUSE_CLIENT --query "DROP TABLE nested_in_nested_protobuf_00825"

Some files were not shown because too many files have changed in this diff Show More