From 06e203d00420197d264492c42c67539171646c71 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Fri, 14 Feb 2020 17:28:33 +0300 Subject: [PATCH 01/40] StorageStripeLog S3 support. --- .../CompressedReadBufferFromFile.cpp | 19 +++++--- .../CompressedReadBufferFromFile.h | 8 ++-- dbms/src/Disks/DiskLocal.cpp | 2 +- dbms/src/Disks/DiskLocal.h | 3 +- dbms/src/Disks/DiskMemory.cpp | 39 +++++++++++++++- dbms/src/Disks/DiskMemory.h | 30 ++++++++++++- dbms/src/Disks/DiskS3.cpp | 44 ++++++++++++------- dbms/src/Disks/DiskS3.h | 2 +- dbms/src/Disks/IDisk.h | 4 +- .../IO/MMapReadBufferFromFileDescriptor.cpp | 2 +- .../src/IO/MMapReadBufferFromFileDescriptor.h | 4 +- dbms/src/IO/ReadBufferAIO.cpp | 8 ++-- dbms/src/IO/ReadBufferAIO.h | 4 +- dbms/src/IO/ReadBufferFromFileBase.h | 8 +--- dbms/src/IO/ReadBufferFromFileDescriptor.h | 4 +- dbms/src/IO/ReadBufferFromMemory.cpp | 5 +++ dbms/src/IO/ReadBufferFromMemory.h | 2 + dbms/src/IO/ReadBufferFromS3.cpp | 6 +++ dbms/src/IO/ReadBufferFromS3.h | 1 + dbms/src/IO/SeekableReadBuffer.h | 5 +++ dbms/src/Storages/StorageLog.cpp | 25 +++++++++-- dbms/src/Storages/StorageStripeLog.cpp | 28 +++++++++--- dbms/src/Storages/StorageTinyLog.cpp | 26 ++++++++--- dbms/tests/config/disks.xml | 9 ++++ .../integration/test_disk_memory/__init__.py | 0 .../test_disk_memory/configs/config.xml | 19 -------- .../integration/test_disk_memory/test.py | 30 ------------- .../integration/test_log_family_s3/test.py | 12 ++--- .../01079_log_family_disk_memory.reference | 18 ++++++++ .../01079_log_family_disk_memory.sql | 40 +++++++++++++++++ docker/test/stateless/Dockerfile | 1 + 31 files changed, 286 insertions(+), 122 deletions(-) create mode 100644 dbms/tests/config/disks.xml delete mode 100644 dbms/tests/integration/test_disk_memory/__init__.py delete mode 100644 dbms/tests/integration/test_disk_memory/configs/config.xml delete mode 100644 dbms/tests/integration/test_disk_memory/test.py create mode 100644 dbms/tests/queries/0_stateless/01079_log_family_disk_memory.reference create mode 100644 dbms/tests/queries/0_stateless/01079_log_family_disk_memory.sql diff --git a/dbms/src/Compression/CompressedReadBufferFromFile.cpp b/dbms/src/Compression/CompressedReadBufferFromFile.cpp index 63bacde6d78..b87ab5f1528 100644 --- a/dbms/src/Compression/CompressedReadBufferFromFile.cpp +++ b/dbms/src/Compression/CompressedReadBufferFromFile.cpp @@ -1,14 +1,13 @@ #include "CompressedReadBufferFromFile.h" -#include -#include #include #include +#include +#include namespace DB { - namespace ErrorCodes { extern const int SEEK_POSITION_OUT_OF_BOUND; @@ -31,12 +30,18 @@ bool CompressedReadBufferFromFile::nextImpl() return true; } +CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr buf) + : BufferWithOwnMemory(0), p_file_in(std::move(buf)), file_in(*p_file_in) +{ + compressed_in = &file_in; +} + CompressedReadBufferFromFile::CompressedReadBufferFromFile( const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, size_t buf_size) - : BufferWithOwnMemory(0), - p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, buf_size)), - file_in(*p_file_in) + : BufferWithOwnMemory(0) + , p_file_in(createReadBufferFromFileBase(path, estimated_size, aio_threshold, mmap_threshold, buf_size)) + , file_in(*p_file_in) { compressed_in = &file_in; } @@ -45,7 +50,7 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile( void CompressedReadBufferFromFile::seek(size_t offset_in_compressed_file, size_t 
offset_in_decompressed_block) { if (size_compressed && - offset_in_compressed_file == file_in.getPositionInFile() - size_compressed && + offset_in_compressed_file == file_in.getPosition() - size_compressed && offset_in_decompressed_block <= working_buffer.size()) { bytes += offset(); diff --git a/dbms/src/Compression/CompressedReadBufferFromFile.h b/dbms/src/Compression/CompressedReadBufferFromFile.h index 641e3d6ed1b..33727909a69 100644 --- a/dbms/src/Compression/CompressedReadBufferFromFile.h +++ b/dbms/src/Compression/CompressedReadBufferFromFile.h @@ -1,10 +1,10 @@ #pragma once -#include "CompressedReadBufferBase.h" -#include -#include #include +#include +#include #include +#include "CompressedReadBufferBase.h" namespace DB @@ -29,6 +29,8 @@ private: bool nextImpl() override; public: + CompressedReadBufferFromFile(std::unique_ptr buf); + CompressedReadBufferFromFile( const std::string & path, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); diff --git a/dbms/src/Disks/DiskLocal.cpp b/dbms/src/Disks/DiskLocal.cpp index 421583a4258..74300cf61b5 100644 --- a/dbms/src/Disks/DiskLocal.cpp +++ b/dbms/src/Disks/DiskLocal.cpp @@ -200,7 +200,7 @@ void DiskLocal::copyFile(const String & from_path, const String & to_path) Poco::File(disk_path + from_path).copyTo(disk_path + to_path); } -std::unique_ptr DiskLocal::readFile(const String & path, size_t buf_size) const +std::unique_ptr DiskLocal::readFile(const String & path, size_t buf_size) const { return std::make_unique(disk_path + path, buf_size); } diff --git a/dbms/src/Disks/DiskLocal.h b/dbms/src/Disks/DiskLocal.h index 1a9a2e8f6cd..8d9d51fc384 100644 --- a/dbms/src/Disks/DiskLocal.h +++ b/dbms/src/Disks/DiskLocal.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -66,7 +67,7 @@ public: void copyFile(const String & from_path, const String & to_path) override; - std::unique_ptr readFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE) const override; + std::unique_ptr readFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE) const override; std::unique_ptr writeFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, WriteMode mode = WriteMode::Rewrite) override; diff --git a/dbms/src/Disks/DiskMemory.cpp b/dbms/src/Disks/DiskMemory.cpp index cc61b0f870d..30509d42b43 100644 --- a/dbms/src/Disks/DiskMemory.cpp +++ b/dbms/src/Disks/DiskMemory.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes extern const int FILE_ALREADY_EXISTS; extern const int DIRECTORY_DOESNT_EXIST; extern const int CANNOT_DELETE_DIRECTORY; + extern const int CANNOT_SEEK_THROUGH_FILE; } @@ -37,8 +38,42 @@ private: std::vector::iterator iter; }; +bool ReadIndirectBuffer::nextImpl() +{ + if (!initialized) + { + initialized = true; + + internal_buffer = buf.buffer(); + working_buffer = internal_buffer; + + return true; + } + + return false; +} + +off_t ReadIndirectBuffer::seek(off_t off, int whence) +{ + if (whence != SEEK_SET) + throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + + off_t result = buf.seek(off, whence); + pos = buf.position(); + + return result; +} + +off_t ReadIndirectBuffer::getPosition() +{ + return pos - working_buffer.begin(); +} + void WriteIndirectBuffer::finalize() { + if (isFinished()) + return; + next(); WriteBufferFromVector::finalize(); @@ -249,7 +284,7 @@ void DiskMemory::copyFile(const String & /*from_path*/, const String & /*to_path throw Exception("Method copyFile is not implemented for 
memory disks", ErrorCodes::NOT_IMPLEMENTED); } -std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/) const +std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/) const { std::lock_guard lock(mutex); @@ -257,7 +292,7 @@ std::unique_ptr DiskMemory::readFile(const String & path, si if (iter == files.end()) throw Exception("File '" + path + "' does not exist", ErrorCodes::FILE_DOESNT_EXIST); - return std::make_unique(iter->second.data); + return std::make_unique(path, iter->second.data); } std::unique_ptr DiskMemory::writeFile(const String & path, size_t /*buf_size*/, WriteMode mode) diff --git a/dbms/src/Disks/DiskMemory.h b/dbms/src/Disks/DiskMemory.h index 6bbd13d2988..dace7c47da3 100644 --- a/dbms/src/Disks/DiskMemory.h +++ b/dbms/src/Disks/DiskMemory.h @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include namespace DB @@ -13,7 +15,31 @@ class DiskMemory; class ReadBuffer; class WriteBuffer; -// This class is responsible to update files metadata after buffer is finalized. +/// Adapter with actual behaviour as ReadBufferFromString. +class ReadIndirectBuffer : public ReadBufferFromFileBase +{ +public: + ReadIndirectBuffer(String path_, const String & data_) + : ReadBufferFromFileBase(), buf(ReadBufferFromString(data_)), path(std::move(path_)) + { + } + + std::string getFileName() const override { return path; } + +private: + bool nextImpl() override; + +public: + off_t seek(off_t off, int whence) override; + off_t getPosition() override; + +private: + bool initialized = false; + ReadBufferFromString buf; + String path; +}; + +/// This class is responsible to update files metadata after buffer is finalized. class WriteIndirectBuffer : public WriteBufferFromOwnString { public: @@ -76,7 +102,7 @@ public: void copyFile(const String & from_path, const String & to_path) override; - std::unique_ptr readFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE) const override; + std::unique_ptr readFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE) const override; std::unique_ptr writeFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, WriteMode mode = WriteMode::Rewrite) override; diff --git a/dbms/src/Disks/DiskS3.cpp b/dbms/src/Disks/DiskS3.cpp index 25701507884..7142b6a3643 100644 --- a/dbms/src/Disks/DiskS3.cpp +++ b/dbms/src/Disks/DiskS3.cpp @@ -137,17 +137,17 @@ namespace // Reads data from S3. // It supports reading from multiple S3 paths that resides in Metadata. 
- class ReadIndirectBufferFromS3 : public BufferWithOwnMemory + class ReadIndirectBufferFromS3 : public ReadBufferFromFileBase { public: ReadIndirectBufferFromS3( std::shared_ptr client_ptr_, const String & bucket_, Metadata metadata_, size_t buf_size_) - : BufferWithOwnMemory(buf_size_) + : ReadBufferFromFileBase() , client_ptr(std::move(client_ptr_)) , bucket(bucket_) , metadata(std::move(metadata_)) , buf_size(buf_size_) - , offset(0) + , absolute_position(0) , initialized(false) , current_buf_idx(0) , current_buf(nullptr) @@ -156,9 +156,6 @@ namespace off_t seek(off_t offset_, int whence) override { - if (initialized) - throw Exception("Seek is allowed only before first read attempt from the buffer.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); - if (whence != SEEK_SET) throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); @@ -169,14 +166,23 @@ namespace + std::to_string(offset_) + ", Max: " + std::to_string(metadata.total_size), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); - offset = offset_; + absolute_position = offset_; - return offset; + /// TODO: Do not re-initialize buffer if current position within working buffer. + current_buf = initialize(); + pos = working_buffer.end(); + + return absolute_position; } + off_t getPosition() override { return absolute_position - available(); } + + std::string getFileName() const override { return metadata.metadata_file_path; } + private: std::unique_ptr initialize() { + size_t offset = absolute_position; for (UInt32 i = 0; i < metadata.s3_objects_count; ++i) { current_buf_idx = i; @@ -190,6 +196,7 @@ namespace } offset -= size; } + initialized = true; return nullptr; } @@ -199,14 +206,13 @@ namespace if (!initialized) { current_buf = initialize(); - - initialized = true; } // If current buffer has remaining data - use it. 
if (current_buf && current_buf->next()) { working_buffer = current_buf->buffer(); + absolute_position += working_buffer.size(); return true; } @@ -219,6 +225,7 @@ namespace current_buf = std::make_unique(client_ptr, bucket, path, buf_size); current_buf->next(); working_buffer = current_buf->buffer(); + absolute_position += working_buffer.size(); return true; } @@ -229,7 +236,7 @@ namespace Metadata metadata; size_t buf_size; - size_t offset; + size_t absolute_position = 0; bool initialized; UInt32 current_buf_idx; std::unique_ptr current_buf; @@ -337,8 +344,13 @@ private: }; -DiskS3::DiskS3(String name_, std::shared_ptr client_, String bucket_, String s3_root_path_, - String metadata_path_, size_t min_upload_part_size_) +DiskS3::DiskS3( + String name_, + std::shared_ptr client_, + String bucket_, + String s3_root_path_, + String metadata_path_, + size_t min_upload_part_size_) : name(std::move(name_)) , client(std::move(client_)) , bucket(std::move(bucket_)) @@ -445,7 +457,7 @@ void DiskS3::copyFile(const String & from_path, const String & to_path) to.save(); } -std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size) const +std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size) const { Metadata metadata(metadata_path + path); @@ -628,8 +640,8 @@ void registerDiskS3(DiskFactory & factory) String metadata_path = context.getPath() + "disks/" + name + "/"; - auto s3disk = std::make_shared(name, client, uri.bucket, uri.key, metadata_path, - context.getSettingsRef().s3_min_upload_part_size); + auto s3disk + = std::make_shared(name, client, uri.bucket, uri.key, metadata_path, context.getSettingsRef().s3_min_upload_part_size); /// This code is used only to check access to the corresponding disk. checkWriteAccess(s3disk); diff --git a/dbms/src/Disks/DiskS3.h b/dbms/src/Disks/DiskS3.h index bafeb41a261..759e2f347d9 100644 --- a/dbms/src/Disks/DiskS3.h +++ b/dbms/src/Disks/DiskS3.h @@ -62,7 +62,7 @@ public: void copyFile(const String & from_path, const String & to_path) override; - std::unique_ptr readFile(const String & path, size_t buf_size) const override; + std::unique_ptr readFile(const String & path, size_t buf_size) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode) override; diff --git a/dbms/src/Disks/IDisk.h b/dbms/src/Disks/IDisk.h index 76e09dda983..ff920897ebf 100644 --- a/dbms/src/Disks/IDisk.h +++ b/dbms/src/Disks/IDisk.h @@ -25,7 +25,7 @@ using DiskDirectoryIteratorPtr = std::unique_ptr; class IReservation; using ReservationPtr = std::unique_ptr; -class SeekableReadBuffer; +class ReadBufferFromFileBase; class WriteBuffer; /** @@ -122,7 +122,7 @@ public: virtual void copyFile(const String & from_path, const String & to_path) = 0; /// Open the file for read and return SeekableReadBuffer object. - virtual std::unique_ptr readFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE) const = 0; + virtual std::unique_ptr readFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE) const = 0; /// Open the file for write and return WriteBuffer object. 
virtual std::unique_ptr writeFile(const String & path, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, WriteMode mode = WriteMode::Rewrite) = 0; diff --git a/dbms/src/IO/MMapReadBufferFromFileDescriptor.cpp b/dbms/src/IO/MMapReadBufferFromFileDescriptor.cpp index 2d1ddba5f58..027b95bc022 100644 --- a/dbms/src/IO/MMapReadBufferFromFileDescriptor.cpp +++ b/dbms/src/IO/MMapReadBufferFromFileDescriptor.cpp @@ -100,7 +100,7 @@ int MMapReadBufferFromFileDescriptor::getFD() const return fd; } -off_t MMapReadBufferFromFileDescriptor::getPositionInFile() +off_t MMapReadBufferFromFileDescriptor::getPosition() { return count(); } diff --git a/dbms/src/IO/MMapReadBufferFromFileDescriptor.h b/dbms/src/IO/MMapReadBufferFromFileDescriptor.h index fb6d3651b41..e409e9d2d0c 100644 --- a/dbms/src/IO/MMapReadBufferFromFileDescriptor.h +++ b/dbms/src/IO/MMapReadBufferFromFileDescriptor.h @@ -32,9 +32,9 @@ public: /// unmap memory before call to destructor void finish(); - off_t getPositionInFile() override; + off_t getPosition() override; std::string getFileName() const override; - int getFD() const override; + int getFD() const; private: size_t length = 0; diff --git a/dbms/src/IO/ReadBufferAIO.cpp b/dbms/src/IO/ReadBufferAIO.cpp index f7d3cd475af..37a6aae5c05 100644 --- a/dbms/src/IO/ReadBufferAIO.cpp +++ b/dbms/src/IO/ReadBufferAIO.cpp @@ -163,17 +163,17 @@ off_t ReadBufferAIO::seek(off_t off, int whence) { if (off >= 0) { - if (off > (std::numeric_limits::max() - getPositionInFile())) + if (off > (std::numeric_limits::max() - getPosition())) throw Exception("SEEK_CUR overflow", ErrorCodes::ARGUMENT_OUT_OF_BOUND); } - else if (off < -getPositionInFile()) + else if (off < -getPosition()) throw Exception("SEEK_CUR underflow", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - new_pos_in_file = getPositionInFile() + off; + new_pos_in_file = getPosition() + off; } else throw Exception("ReadBufferAIO::seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - if (new_pos_in_file != getPositionInFile()) + if (new_pos_in_file != getPosition()) { off_t first_read_pos_in_file = first_unread_pos_in_file - static_cast(working_buffer.size()); if (hasPendingData() && (new_pos_in_file >= first_read_pos_in_file) && (new_pos_in_file <= first_unread_pos_in_file)) diff --git a/dbms/src/IO/ReadBufferAIO.h b/dbms/src/IO/ReadBufferAIO.h index 446034e4733..2dd11b44095 100644 --- a/dbms/src/IO/ReadBufferAIO.h +++ b/dbms/src/IO/ReadBufferAIO.h @@ -36,9 +36,9 @@ public: ReadBufferAIO & operator=(const ReadBufferAIO &) = delete; void setMaxBytes(size_t max_bytes_read_); - off_t getPositionInFile() override { return first_unread_pos_in_file - (working_buffer.end() - pos); } + off_t getPosition() override { return first_unread_pos_in_file - (working_buffer.end() - pos); } std::string getFileName() const override { return filename; } - int getFD() const override { return fd; } + int getFD() const { return fd; } off_t seek(off_t off, int whence) override; diff --git a/dbms/src/IO/ReadBufferFromFileBase.h b/dbms/src/IO/ReadBufferFromFileBase.h index d9e26d80405..58f54904e85 100644 --- a/dbms/src/IO/ReadBufferFromFileBase.h +++ b/dbms/src/IO/ReadBufferFromFileBase.h @@ -1,17 +1,16 @@ #pragma once -#include #include #include +#include #include -#include #include +#include #include #include "SeekableReadBuffer.h" namespace DB { - class ReadBufferFromFileBase : public BufferWithOwnMemory { public: @@ -19,9 +18,7 @@ public: ReadBufferFromFileBase(size_t buf_size, char * existing_memory, size_t alignment); 
ReadBufferFromFileBase(ReadBufferFromFileBase &&) = default; ~ReadBufferFromFileBase() override; - virtual off_t getPositionInFile() = 0; virtual std::string getFileName() const = 0; - virtual int getFD() const = 0; /// It is possible to get information about the time of each reading. struct ProfileInfo @@ -43,7 +40,6 @@ public: protected: ProfileCallback profile_callback; clockid_t clock_type{}; - }; } diff --git a/dbms/src/IO/ReadBufferFromFileDescriptor.h b/dbms/src/IO/ReadBufferFromFileDescriptor.h index b19a04aef18..dc2d581ebf1 100644 --- a/dbms/src/IO/ReadBufferFromFileDescriptor.h +++ b/dbms/src/IO/ReadBufferFromFileDescriptor.h @@ -27,12 +27,12 @@ public: ReadBufferFromFileDescriptor(ReadBufferFromFileDescriptor &&) = default; - int getFD() const override + int getFD() const { return fd; } - off_t getPositionInFile() override + off_t getPosition() override { return pos_in_file - (working_buffer.end() - pos); } diff --git a/dbms/src/IO/ReadBufferFromMemory.cpp b/dbms/src/IO/ReadBufferFromMemory.cpp index 2097c78c34e..98c39c833b0 100644 --- a/dbms/src/IO/ReadBufferFromMemory.cpp +++ b/dbms/src/IO/ReadBufferFromMemory.cpp @@ -43,4 +43,9 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence) throw Exception("Only SEEK_SET and SEEK_CUR seek modes allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); } +off_t ReadBufferFromMemory::getPosition() +{ + return pos - working_buffer.begin(); +} + } diff --git a/dbms/src/IO/ReadBufferFromMemory.h b/dbms/src/IO/ReadBufferFromMemory.h index ba79c9e56bf..c4330f312f9 100644 --- a/dbms/src/IO/ReadBufferFromMemory.h +++ b/dbms/src/IO/ReadBufferFromMemory.h @@ -25,6 +25,8 @@ public: } off_t seek(off_t off, int whence) override; + + off_t getPosition() override; }; } diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index a77d742370c..1fa318cf629 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -58,6 +58,12 @@ off_t ReadBufferFromS3::seek(off_t offset_, int whence) return offset; } + +off_t ReadBufferFromS3::getPosition() +{ + return offset + count(); +} + std::unique_ptr ReadBufferFromS3::initialize() { LOG_TRACE(log, "Read S3 object. Bucket: " + bucket + ", Key: " + key + ", Offset: " + std::to_string(offset)); diff --git a/dbms/src/IO/ReadBufferFromS3.h b/dbms/src/IO/ReadBufferFromS3.h index f04794d4458..15d3b4f0beb 100644 --- a/dbms/src/IO/ReadBufferFromS3.h +++ b/dbms/src/IO/ReadBufferFromS3.h @@ -45,6 +45,7 @@ public: bool nextImpl() override; off_t seek(off_t off, int whence) override; + off_t getPosition() override; private: std::unique_ptr initialize(); diff --git a/dbms/src/IO/SeekableReadBuffer.h b/dbms/src/IO/SeekableReadBuffer.h index 7deb30ed28f..be06887e61f 100644 --- a/dbms/src/IO/SeekableReadBuffer.h +++ b/dbms/src/IO/SeekableReadBuffer.h @@ -20,6 +20,11 @@ public: * @return New position from the begging of underlying buffer / file. */ virtual off_t seek(off_t off, int whence) = 0; + + /** + * @return Offset from the begging of underlying buffer / file corresponds to buffer current position. 
+ */ + virtual off_t getPosition() = 0; }; } diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index 549dd352a48..da3023a5776 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -5,6 +5,9 @@ #include #include +#include + +#include #include #include #include @@ -18,6 +21,7 @@ #include #include +#include #define DBMS_STORAGE_LOG_DATA_FILE_EXTENSION ".bin" @@ -88,7 +92,7 @@ private: plain->seek(offset, SEEK_SET); } - std::unique_ptr plain; + std::unique_ptr plain; CompressedReadBuffer compressed; }; @@ -625,13 +629,26 @@ void registerStorageLog(StorageFactory & factory) { factory.registerStorage("Log", [](const StorageFactory::Arguments & args) { - if (!args.engine_args.empty()) + ASTs & engine_args = args.engine_args; + + if (engine_args.size() > 1) throw Exception( - "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", + "Engine " + args.engine_name + " requires 0 or 1 arguments: [disk_name] (" + toString(args.engine_args.size()) + " given)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + for (size_t i = 0; i < engine_args.size(); ++i) + engine_args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); + + DiskPtr disk = args.context.getDefaultDisk(); + + if (engine_args.size() == 1) + { + String disk_name = engine_args[0]->as().value.safeGet(); + disk = args.context.getDisk(disk_name); + } + return StorageLog::create( - args.context.getDefaultDisk(), args.relative_data_path, args.table_id, args.columns, args.constraints, + disk, args.relative_data_path, args.table_id, args.columns, args.constraints, args.context.getSettings().max_compress_block_size); }); } diff --git a/dbms/src/Storages/StorageStripeLog.cpp b/dbms/src/Storages/StorageStripeLog.cpp index 5aa3b52f969..9912cf47a89 100644 --- a/dbms/src/Storages/StorageStripeLog.cpp +++ b/dbms/src/Storages/StorageStripeLog.cpp @@ -8,7 +8,6 @@ #include #include -#include #include #include #include @@ -26,8 +25,10 @@ #include -#include +#include +#include #include +#include namespace DB @@ -120,7 +121,7 @@ private: String data_file_path = storage.table_path + "data.bin"; size_t buffer_size = std::min(max_read_buffer_size, storage.disk->getFileSize(data_file_path)); - data_in.emplace(fullPath(storage.disk, data_file_path), 0, 0, buffer_size); + data_in.emplace(storage.disk->readFile(data_file_path, buffer_size)); block_in.emplace(*data_in, 0, index_begin, index_end); } } @@ -253,7 +254,7 @@ BlockInputStreams StorageStripeLog::read( if (!disk->exists(index_file)) return { std::make_shared(getSampleBlockForColumns(column_names)) }; - CompressedReadBufferFromFile index_in(fullPath(disk, index_file), 0, 0, 0, INDEX_BUFFER_SIZE); + CompressedReadBufferFromFile index_in(disk->readFile(index_file, INDEX_BUFFER_SIZE)); std::shared_ptr index{std::make_shared(index_in, column_names_set)}; BlockInputStreams res; @@ -307,13 +308,26 @@ void registerStorageStripeLog(StorageFactory & factory) { factory.registerStorage("StripeLog", [](const StorageFactory::Arguments & args) { - if (!args.engine_args.empty()) + ASTs & engine_args = args.engine_args; + + if (engine_args.size() > 1) throw Exception( - "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", + "Engine " + args.engine_name + " requires 0 or 1 arguments: [disk_name] (" + toString(args.engine_args.size()) + " given)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + for (size_t i 
= 0; i < engine_args.size(); ++i) + engine_args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); + + DiskPtr disk = args.context.getDefaultDisk(); + + if (engine_args.size() == 1) + { + String disk_name = engine_args[0]->as().value.safeGet(); + disk = args.context.getDisk(disk_name); + } + return StorageStripeLog::create( - args.context.getDefaultDisk(), args.relative_data_path, args.table_id, args.columns, args.constraints, + disk, args.relative_data_path, args.table_id, args.columns, args.constraints, args.attach, args.context.getSettings().max_compress_block_size); }); } diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index aeb90399816..db90ba3e41d 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -25,9 +26,11 @@ #include -#include -#include +#include +#include #include +#include +#include #define DBMS_STORAGE_LOG_DATA_FILE_EXTENSION ".bin" @@ -436,13 +439,26 @@ void registerStorageTinyLog(StorageFactory & factory) { factory.registerStorage("TinyLog", [](const StorageFactory::Arguments & args) { - if (!args.engine_args.empty()) + ASTs & engine_args = args.engine_args; + + if (engine_args.size() > 1) throw Exception( - "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", + "Engine " + args.engine_name + " requires 0 or 1 arguments: [disk_name] (" + toString(args.engine_args.size()) + " given)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + for (size_t i = 0; i < engine_args.size(); ++i) + engine_args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); + + DiskPtr disk = args.context.getDefaultDisk(); + + if (engine_args.size() == 1) + { + String disk_name = engine_args[0]->as().value.safeGet(); + disk = args.context.getDisk(disk_name); + } + return StorageTinyLog::create( - args.context.getDefaultDisk(), args.relative_data_path, args.table_id, args.columns, args.constraints, + disk, args.relative_data_path, args.table_id, args.columns, args.constraints, args.attach, args.context.getSettings().max_compress_block_size); }); } diff --git a/dbms/tests/config/disks.xml b/dbms/tests/config/disks.xml new file mode 100644 index 00000000000..1387c7f76f1 --- /dev/null +++ b/dbms/tests/config/disks.xml @@ -0,0 +1,9 @@ + + + + + memory + + + + \ No newline at end of file diff --git a/dbms/tests/integration/test_disk_memory/__init__.py b/dbms/tests/integration/test_disk_memory/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/dbms/tests/integration/test_disk_memory/configs/config.xml b/dbms/tests/integration/test_disk_memory/configs/config.xml deleted file mode 100644 index 4573ddf9cfd..00000000000 --- a/dbms/tests/integration/test_disk_memory/configs/config.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - trace - /var/log/clickhouse-server/clickhouse-server.log - /var/log/clickhouse-server/clickhouse-server.err.log - 1000M - 10 - /var/log/clickhouse-server/stderr.log - /var/log/clickhouse-server/stdout.log - - - - - - memory - - - - diff --git a/dbms/tests/integration/test_disk_memory/test.py b/dbms/tests/integration/test_disk_memory/test.py deleted file mode 100644 index ea9309fcbe1..00000000000 --- a/dbms/tests/integration/test_disk_memory/test.py +++ /dev/null @@ -1,30 +0,0 @@ -import pytest - -from helpers.cluster import ClickHouseCluster -from helpers.test_tools import TSV 
- -cluster = ClickHouseCluster(__file__) -node = cluster.add_instance("node", main_configs=['configs/config.xml']) - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - yield cluster - finally: - cluster.shutdown() - - -def test_tinylog(started_cluster): - node.query('''CREATE DATABASE IF NOT EXISTS test''') - - node.query('''CREATE TABLE test.tinylog (s String, n UInt8) ENGINE = TinyLog''') - - node.query('''INSERT INTO test.tinylog SELECT toString(number), number * 2 FROM system.numbers LIMIT 5''') - assert TSV(node.query('''SELECT * FROM test.tinylog''')) == TSV('0\t0\n1\t2\n2\t4\n3\t6\n4\t8') - - node.query('''TRUNCATE TABLE test.tinylog''') - assert TSV(node.query('''SELECT * FROM test.tinylog''')) == TSV('') - - node.query('''DROP TABLE test.tinylog''') diff --git a/dbms/tests/integration/test_log_family_s3/test.py b/dbms/tests/integration/test_log_family_s3/test.py index 09002a95f2c..63272a7c58e 100644 --- a/dbms/tests/integration/test_log_family_s3/test.py +++ b/dbms/tests/integration/test_log_family_s3/test.py @@ -34,8 +34,10 @@ def cluster(): cluster.shutdown() -@pytest.mark.parametrize("log_engine,files_overhead", [("TinyLog", 1), ("Log", 2)]) -def test_log_family_s3(cluster, log_engine, files_overhead): +@pytest.mark.parametrize( + "log_engine,files_overhead,files_overhead_per_insert", + [("TinyLog", 1, 1), ("Log", 2, 1), ("StripeLog", 1, 2)]) +def test_log_family_s3(cluster, log_engine, files_overhead, files_overhead_per_insert): node = cluster.instances["node"] minio = cluster.minio_client @@ -43,15 +45,15 @@ def test_log_family_s3(cluster, log_engine, files_overhead): node.query("INSERT INTO s3_test SELECT number FROM numbers(5)") assert node.query("SELECT * FROM s3_test") == "0\n1\n2\n3\n4\n" - assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == 1 + files_overhead + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == files_overhead_per_insert + files_overhead node.query("INSERT INTO s3_test SELECT number + 5 FROM numbers(3)") assert node.query("SELECT * FROM s3_test order by id") == "0\n1\n2\n3\n4\n5\n6\n7\n" - assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == 2 + files_overhead + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == files_overhead_per_insert * 2 + files_overhead node.query("INSERT INTO s3_test SELECT number + 8 FROM numbers(1)") assert node.query("SELECT * FROM s3_test order by id") == "0\n1\n2\n3\n4\n5\n6\n7\n8\n" - assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == 3 + files_overhead + assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == files_overhead_per_insert * 3 + files_overhead node.query("TRUNCATE TABLE s3_test") assert len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == 0 diff --git a/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.reference b/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.reference new file mode 100644 index 00000000000..9d8e6d18e1d --- /dev/null +++ b/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.reference @@ -0,0 +1,18 @@ +0 +0 +1 +0 +1 +2 +0 +0 +1 +0 +1 +2 +0 +0 +1 +0 +1 +2 diff --git a/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.sql b/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.sql new file mode 100644 index 00000000000..bbc75f1d3bd --- /dev/null +++ b/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.sql @@ -0,0 +1,40 @@ +DROP TABLE IF EXISTS log; + +CREATE TABLE log (x UInt8) ENGINE = 
StripeLog ('disk_memory'); + +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (0); +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (1); +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (2); +SELECT * FROM log ORDER BY x; + +TRUNCATE TABLE log; +DROP TABLE log; + +CREATE TABLE log (x UInt8) ENGINE = TinyLog ('disk_memory'); + +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (0); +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (1); +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (2); +SELECT * FROM log ORDER BY x; + +TRUNCATE TABLE log; +DROP TABLE log; + +CREATE TABLE log (x UInt8) ENGINE = Log ('disk_memory'); + +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (0); +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (1); +SELECT * FROM log ORDER BY x; +INSERT INTO log VALUES (2); +SELECT * FROM log ORDER BY x; + +TRUNCATE TABLE log; +DROP TABLE log; diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index 51e92bfa5a3..c30f63c0c13 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -56,5 +56,6 @@ CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \ ln -s /usr/share/clickhouse-test/config/strings_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/share/clickhouse-test/config/decimals_dictionary.xml /etc/clickhouse-server/; \ ln -s /usr/share/clickhouse-test/config/macros.xml /etc/clickhouse-server/config.d/; \ + ln -s /usr/share/clickhouse-test/config/disks.xml /etc/clickhouse-server/config.d/; \ service zookeeper start; sleep 5; \ service clickhouse-server start && sleep 5 && clickhouse-test --testname --shard --zookeeper $ADDITIONAL_OPTIONS $SKIP_TESTS_OPTION 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt From f3bec50d2298d01a1ae4ff24b8cc8838d79a881c Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 17 Feb 2020 13:49:36 +0300 Subject: [PATCH 02/40] Fixed getPositionInFile() method usages. --- dbms/src/IO/tests/read_buffer_aio.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/IO/tests/read_buffer_aio.cpp b/dbms/src/IO/tests/read_buffer_aio.cpp index adb2f7d5458..04aafd1fee2 100644 --- a/dbms/src/IO/tests/read_buffer_aio.cpp +++ b/dbms/src/IO/tests/read_buffer_aio.cpp @@ -283,14 +283,14 @@ bool test6(const std::string & filename, const std::string & buf) DB::ReadBufferAIO in(filename, 3 * DEFAULT_AIO_FILE_BLOCK_SIZE); - if (in.getPositionInFile() != 0) + if (in.getPosition() != 0) return false; size_t count = in.read(newbuf.data(), newbuf.length()); if (count != newbuf.length()) return false; - if (static_cast(in.getPositionInFile()) != buf.length()) + if (static_cast(in.getPosition()) != buf.length()) return false; return true; @@ -646,7 +646,7 @@ bool test20(const std::string & filename, const std::string & buf) return false; } - (void) in.getPositionInFile(); + (void) in.getPosition(); { std::string newbuf; From cceaf69264b043958a9a3c4bf081eb4ae3ade4ed Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 17 Feb 2020 13:51:16 +0300 Subject: [PATCH 03/40] Change log_family_disk_memory test number. 
--- ...sk_memory.reference => 01082_log_family_disk_memory.reference} | 0 ...og_family_disk_memory.sql => 01082_log_family_disk_memory.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename dbms/tests/queries/0_stateless/{01079_log_family_disk_memory.reference => 01082_log_family_disk_memory.reference} (100%) rename dbms/tests/queries/0_stateless/{01079_log_family_disk_memory.sql => 01082_log_family_disk_memory.sql} (100%) diff --git a/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.reference b/dbms/tests/queries/0_stateless/01082_log_family_disk_memory.reference similarity index 100% rename from dbms/tests/queries/0_stateless/01079_log_family_disk_memory.reference rename to dbms/tests/queries/0_stateless/01082_log_family_disk_memory.reference diff --git a/dbms/tests/queries/0_stateless/01079_log_family_disk_memory.sql b/dbms/tests/queries/0_stateless/01082_log_family_disk_memory.sql similarity index 100% rename from dbms/tests/queries/0_stateless/01079_log_family_disk_memory.sql rename to dbms/tests/queries/0_stateless/01082_log_family_disk_memory.sql From cf474e1f09becd101c84baa20188237da1ef20fb Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 17 Feb 2020 18:00:39 +0300 Subject: [PATCH 04/40] Fixed DiskMemory->readFile() test. --- dbms/src/Disks/DiskMemory.cpp | 8 +++++--- dbms/src/Disks/tests/gtest_disk.cpp | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/dbms/src/Disks/DiskMemory.cpp b/dbms/src/Disks/DiskMemory.cpp index 30509d42b43..776ea264940 100644 --- a/dbms/src/Disks/DiskMemory.cpp +++ b/dbms/src/Disks/DiskMemory.cpp @@ -55,11 +55,13 @@ bool ReadIndirectBuffer::nextImpl() off_t ReadIndirectBuffer::seek(off_t off, int whence) { - if (whence != SEEK_SET) - throw Exception("Only SEEK_SET mode is allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + /// Synchronize position in working buffer and string buffer. + buf.seek(offset(), SEEK_SET); + /// Seek string buffer position. off_t result = buf.seek(off, whence); - pos = buf.position(); + + pos = working_buffer.begin() + result; return result; } diff --git a/dbms/src/Disks/tests/gtest_disk.cpp b/dbms/src/Disks/tests/gtest_disk.cpp index 7c487c9a428..ea67545e5fa 100644 --- a/dbms/src/Disks/tests/gtest_disk.cpp +++ b/dbms/src/Disks/tests/gtest_disk.cpp @@ -108,11 +108,13 @@ TYPED_TEST(DiskTest, readFile) // Test SEEK_SET { - DB::String data; + String buf(4, '0'); std::unique_ptr in = disk->readFile("test_file"); + in->seek(5, SEEK_SET); - readString(data, *in); - EXPECT_EQ("data", data); + + in->readStrict(buf.data(), 4); + EXPECT_EQ("data", buf); } // Test SEEK_CUR From e849654628dbd767a966fac3d622ee2868e18ff9 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 17 Feb 2020 23:08:35 +0300 Subject: [PATCH 05/40] Reworked ReadIndirectBuffer for DiskMemory. 
--- dbms/src/Disks/DiskMemory.cpp | 62 ++++++++++++++++++++++------------- dbms/src/Disks/DiskMemory.h | 11 +------ 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/dbms/src/Disks/DiskMemory.cpp b/dbms/src/Disks/DiskMemory.cpp index 776ea264940..2b249858f6b 100644 --- a/dbms/src/Disks/DiskMemory.cpp +++ b/dbms/src/Disks/DiskMemory.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int DIRECTORY_DOESNT_EXIST; extern const int CANNOT_DELETE_DIRECTORY; extern const int CANNOT_SEEK_THROUGH_FILE; + extern const int SEEK_POSITION_OUT_OF_BOUND; } @@ -38,32 +39,39 @@ private: std::vector::iterator iter; }; -bool ReadIndirectBuffer::nextImpl() +off_t ReadIndirectBuffer::seek(off_t offset, int whence) { - if (!initialized) + if (whence == SEEK_SET) { - initialized = true; - - internal_buffer = buf.buffer(); - working_buffer = internal_buffer; - - return true; + if (offset >= 0 && working_buffer.begin() + offset < working_buffer.end()) + { + pos = working_buffer.begin() + offset; + return size_t(pos - working_buffer.begin()); + } + else + throw Exception( + "Seek position is out of bounds. " + "Offset: " + + std::to_string(offset) + ", Max: " + std::to_string(size_t(working_buffer.end() - working_buffer.begin())), + ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); } - - return false; -} - -off_t ReadIndirectBuffer::seek(off_t off, int whence) -{ - /// Synchronize position in working buffer and string buffer. - buf.seek(offset(), SEEK_SET); - - /// Seek string buffer position. - off_t result = buf.seek(off, whence); - - pos = working_buffer.begin() + result; - - return result; + else if (whence == SEEK_CUR) + { + Position new_pos = pos + offset; + if (new_pos >= working_buffer.begin() && new_pos < working_buffer.end()) + { + pos = new_pos; + return size_t(pos - working_buffer.begin()); + } + else + throw Exception( + "Seek position is out of bounds. 
" + "Offset: " + + std::to_string(offset) + ", Max: " + std::to_string(size_t(working_buffer.end() - working_buffer.begin())), + ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); + } + else + throw Exception("Only SEEK_SET and SEEK_CUR seek modes allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); } off_t ReadIndirectBuffer::getPosition() @@ -71,6 +79,14 @@ off_t ReadIndirectBuffer::getPosition() return pos - working_buffer.begin(); } +ReadIndirectBuffer::ReadIndirectBuffer(String path_, const String & data_) + : ReadBufferFromFileBase(), buf(ReadBufferFromString(data_)), path(std::move(path_)) +{ + internal_buffer = buf.buffer(); + working_buffer = internal_buffer; + pos = working_buffer.begin(); +} + void WriteIndirectBuffer::finalize() { if (isFinished()) diff --git a/dbms/src/Disks/DiskMemory.h b/dbms/src/Disks/DiskMemory.h index dace7c47da3..2dd93e58549 100644 --- a/dbms/src/Disks/DiskMemory.h +++ b/dbms/src/Disks/DiskMemory.h @@ -19,22 +19,13 @@ class WriteBuffer; class ReadIndirectBuffer : public ReadBufferFromFileBase { public: - ReadIndirectBuffer(String path_, const String & data_) - : ReadBufferFromFileBase(), buf(ReadBufferFromString(data_)), path(std::move(path_)) - { - } + ReadIndirectBuffer(String path_, const String & data_); std::string getFileName() const override { return path; } - -private: - bool nextImpl() override; - -public: off_t seek(off_t off, int whence) override; off_t getPosition() override; private: - bool initialized = false; ReadBufferFromString buf; String path; }; From a8cffd55aab25b5cbed47f13bc66ee27e07c11ba Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 18 Feb 2020 12:51:22 +0300 Subject: [PATCH 06/40] Formatting issues. --- dbms/src/Disks/DiskMemory.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dbms/src/Disks/DiskMemory.cpp b/dbms/src/Disks/DiskMemory.cpp index 2b249858f6b..ca79cab79af 100644 --- a/dbms/src/Disks/DiskMemory.cpp +++ b/dbms/src/Disks/DiskMemory.cpp @@ -39,6 +39,14 @@ private: std::vector::iterator iter; }; +ReadIndirectBuffer::ReadIndirectBuffer(String path_, const String & data_) + : ReadBufferFromFileBase(), buf(ReadBufferFromString(data_)), path(std::move(path_)) +{ + internal_buffer = buf.buffer(); + working_buffer = internal_buffer; + pos = working_buffer.begin(); +} + off_t ReadIndirectBuffer::seek(off_t offset, int whence) { if (whence == SEEK_SET) @@ -79,14 +87,6 @@ off_t ReadIndirectBuffer::getPosition() return pos - working_buffer.begin(); } -ReadIndirectBuffer::ReadIndirectBuffer(String path_, const String & data_) - : ReadBufferFromFileBase(), buf(ReadBufferFromString(data_)), path(std::move(path_)) -{ - internal_buffer = buf.buffer(); - working_buffer = internal_buffer; - pos = working_buffer.begin(); -} - void WriteIndirectBuffer::finalize() { if (isFinished()) From 0040ad32892b2072454f09bd09ca9d2eba241644 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 18 Feb 2020 15:24:20 +0300 Subject: [PATCH 07/40] Fixed includes. 
--- dbms/src/Compression/CompressedReadBufferFromFile.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/dbms/src/Compression/CompressedReadBufferFromFile.h b/dbms/src/Compression/CompressedReadBufferFromFile.h index 33727909a69..1729490f606 100644 --- a/dbms/src/Compression/CompressedReadBufferFromFile.h +++ b/dbms/src/Compression/CompressedReadBufferFromFile.h @@ -1,10 +1,9 @@ #pragma once -#include -#include -#include -#include #include "CompressedReadBufferBase.h" +#include +#include +#include namespace DB From 05ceef4ef889e503cde03159465a20cd44b9e47a Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 18 Feb 2020 17:41:30 +0300 Subject: [PATCH 08/40] Log family storages disk support via 'SETTINGS' clause. --- dbms/src/Storages/StorageLog.cpp | 26 +++++++------------ dbms/src/Storages/StorageLogSettings.cpp | 22 ++++++++++++++++ dbms/src/Storages/StorageLogSettings.h | 10 +++++++ dbms/src/Storages/StorageStripeLog.cpp | 25 +++++++----------- dbms/src/Storages/StorageTinyLog.cpp | 25 +++++++----------- ...=> 01083_log_family_disk_memory.reference} | 0 ...y.sql => 01083_log_family_disk_memory.sql} | 6 ++--- 7 files changed, 65 insertions(+), 49 deletions(-) create mode 100644 dbms/src/Storages/StorageLogSettings.cpp create mode 100644 dbms/src/Storages/StorageLogSettings.h rename dbms/tests/queries/0_stateless/{01082_log_family_disk_memory.reference => 01083_log_family_disk_memory.reference} (100%) rename dbms/tests/queries/0_stateless/{01082_log_family_disk_memory.sql => 01083_log_family_disk_memory.sql} (75%) diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index da3023a5776..53adbf47203 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -22,7 +22,7 @@ #include #include - +#include "StorageLogSettings.h" #define DBMS_STORAGE_LOG_DATA_FILE_EXTENSION ".bin" #define DBMS_STORAGE_LOG_MARKS_FILE_NAME "__marks.mrk" @@ -627,30 +627,24 @@ CheckResults StorageLog::checkData(const ASTPtr & /* query */, const Context & / void registerStorageLog(StorageFactory & factory) { + StorageFactory::StorageFeatures features{ + .supports_settings = true + }; + factory.registerStorage("Log", [](const StorageFactory::Arguments & args) { - ASTs & engine_args = args.engine_args; - - if (engine_args.size() > 1) + if (!args.engine_args.empty()) throw Exception( - "Engine " + args.engine_name + " requires 0 or 1 arguments: [disk_name] (" + toString(args.engine_args.size()) + " given)", + "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - for (size_t i = 0; i < engine_args.size(); ++i) - engine_args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); - - DiskPtr disk = args.context.getDefaultDisk(); - - if (engine_args.size() == 1) - { - String disk_name = engine_args[0]->as().value.safeGet(); - disk = args.context.getDisk(disk_name); - } + String disk_name = getDiskName(*args.storage_def); + DiskPtr disk = args.context.getDisk(disk_name); return StorageLog::create( disk, args.relative_data_path, args.table_id, args.columns, args.constraints, args.context.getSettings().max_compress_block_size); - }); + }, features); } } diff --git a/dbms/src/Storages/StorageLogSettings.cpp b/dbms/src/Storages/StorageLogSettings.cpp new file mode 100644 index 00000000000..5472c10001c --- /dev/null +++ b/dbms/src/Storages/StorageLogSettings.cpp @@ -0,0 +1,22 @@ +#include 
"StorageLogSettings.h" +#include +#include + +namespace DB +{ +String getDiskName(ASTStorage & storage_def) +{ + if (storage_def.settings) + { + SettingsChanges changes = storage_def.settings->changes; + for (auto it = changes.begin(); it != changes.end(); ++it) + { + if (it->name == "disk_name") + return it->value.safeGet(); + } + } + + return "default"; +} + +} diff --git a/dbms/src/Storages/StorageLogSettings.h b/dbms/src/Storages/StorageLogSettings.h new file mode 100644 index 00000000000..c970cd6be37 --- /dev/null +++ b/dbms/src/Storages/StorageLogSettings.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + class ASTStorage; + + String getDiskName(ASTStorage & storage_def); +} diff --git a/dbms/src/Storages/StorageStripeLog.cpp b/dbms/src/Storages/StorageStripeLog.cpp index 9912cf47a89..84e625c89cc 100644 --- a/dbms/src/Storages/StorageStripeLog.cpp +++ b/dbms/src/Storages/StorageStripeLog.cpp @@ -29,6 +29,7 @@ #include #include #include +#include "StorageLogSettings.h" namespace DB @@ -306,30 +307,24 @@ void StorageStripeLog::truncate(const ASTPtr &, const Context &, TableStructureW void registerStorageStripeLog(StorageFactory & factory) { + StorageFactory::StorageFeatures features{ + .supports_settings = true + }; + factory.registerStorage("StripeLog", [](const StorageFactory::Arguments & args) { - ASTs & engine_args = args.engine_args; - - if (engine_args.size() > 1) + if (!args.engine_args.empty()) throw Exception( - "Engine " + args.engine_name + " requires 0 or 1 arguments: [disk_name] (" + toString(args.engine_args.size()) + " given)", + "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - for (size_t i = 0; i < engine_args.size(); ++i) - engine_args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); - - DiskPtr disk = args.context.getDefaultDisk(); - - if (engine_args.size() == 1) - { - String disk_name = engine_args[0]->as().value.safeGet(); - disk = args.context.getDisk(disk_name); - } + String disk_name = getDiskName(*args.storage_def); + DiskPtr disk = args.context.getDisk(disk_name); return StorageStripeLog::create( disk, args.relative_data_path, args.table_id, args.columns, args.constraints, args.attach, args.context.getSettings().max_compress_block_size); - }); + }, features); } } diff --git a/dbms/src/Storages/StorageTinyLog.cpp b/dbms/src/Storages/StorageTinyLog.cpp index db90ba3e41d..f3d7750fd10 100644 --- a/dbms/src/Storages/StorageTinyLog.cpp +++ b/dbms/src/Storages/StorageTinyLog.cpp @@ -31,6 +31,7 @@ #include #include #include +#include "StorageLogSettings.h" #define DBMS_STORAGE_LOG_DATA_FILE_EXTENSION ".bin" @@ -437,30 +438,24 @@ void StorageTinyLog::drop(TableStructureWriteLockHolder &) void registerStorageTinyLog(StorageFactory & factory) { + StorageFactory::StorageFeatures features{ + .supports_settings = true + }; + factory.registerStorage("TinyLog", [](const StorageFactory::Arguments & args) { - ASTs & engine_args = args.engine_args; - - if (engine_args.size() > 1) + if (!args.engine_args.empty()) throw Exception( - "Engine " + args.engine_name + " requires 0 or 1 arguments: [disk_name] (" + toString(args.engine_args.size()) + " given)", + "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - for (size_t i = 0; i < engine_args.size(); ++i) - engine_args[i] = 
evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); - - DiskPtr disk = args.context.getDefaultDisk(); - - if (engine_args.size() == 1) - { - String disk_name = engine_args[0]->as().value.safeGet(); - disk = args.context.getDisk(disk_name); - } + String disk_name = getDiskName(*args.storage_def); + DiskPtr disk = args.context.getDisk(disk_name); return StorageTinyLog::create( disk, args.relative_data_path, args.table_id, args.columns, args.constraints, args.attach, args.context.getSettings().max_compress_block_size); - }); + }, features); } } diff --git a/dbms/tests/queries/0_stateless/01082_log_family_disk_memory.reference b/dbms/tests/queries/0_stateless/01083_log_family_disk_memory.reference similarity index 100% rename from dbms/tests/queries/0_stateless/01082_log_family_disk_memory.reference rename to dbms/tests/queries/0_stateless/01083_log_family_disk_memory.reference diff --git a/dbms/tests/queries/0_stateless/01082_log_family_disk_memory.sql b/dbms/tests/queries/0_stateless/01083_log_family_disk_memory.sql similarity index 75% rename from dbms/tests/queries/0_stateless/01082_log_family_disk_memory.sql rename to dbms/tests/queries/0_stateless/01083_log_family_disk_memory.sql index bbc75f1d3bd..b5aa101faf3 100644 --- a/dbms/tests/queries/0_stateless/01082_log_family_disk_memory.sql +++ b/dbms/tests/queries/0_stateless/01083_log_family_disk_memory.sql @@ -1,6 +1,6 @@ DROP TABLE IF EXISTS log; -CREATE TABLE log (x UInt8) ENGINE = StripeLog ('disk_memory'); +CREATE TABLE log (x UInt8) ENGINE = StripeLog () SETTINGS disk_name = 'disk_memory'; SELECT * FROM log ORDER BY x; INSERT INTO log VALUES (0); @@ -13,7 +13,7 @@ SELECT * FROM log ORDER BY x; TRUNCATE TABLE log; DROP TABLE log; -CREATE TABLE log (x UInt8) ENGINE = TinyLog ('disk_memory'); +CREATE TABLE log (x UInt8) ENGINE = TinyLog () SETTINGS disk_name = 'disk_memory'; SELECT * FROM log ORDER BY x; INSERT INTO log VALUES (0); @@ -26,7 +26,7 @@ SELECT * FROM log ORDER BY x; TRUNCATE TABLE log; DROP TABLE log; -CREATE TABLE log (x UInt8) ENGINE = Log ('disk_memory'); +CREATE TABLE log (x UInt8) ENGINE = Log () SETTINGS disk_name = 'disk_memory'; SELECT * FROM log ORDER BY x; INSERT INTO log VALUES (0); From ff50a51757cfc4967cb1176575e33b4bf3a4e30e Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 18 Feb 2020 22:13:37 +0300 Subject: [PATCH 09/40] Formatting issues. --- dbms/src/Storages/StorageLogSettings.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/dbms/src/Storages/StorageLogSettings.cpp b/dbms/src/Storages/StorageLogSettings.cpp index 5472c10001c..d2d8e2138d8 100644 --- a/dbms/src/Storages/StorageLogSettings.cpp +++ b/dbms/src/Storages/StorageLogSettings.cpp @@ -15,7 +15,6 @@ String getDiskName(ASTStorage & storage_def) return it->value.safeGet(); } } - return "default"; } From 39123b25b4b529ad4ffb1a4d0418593a12173faf Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2020 07:43:19 +0000 Subject: [PATCH 10/40] Bump commonmark from 0.5.4 to 0.9.1 in /docs/tools Bumps [commonmark](https://github.com/rtfd/commonmark.py) from 0.5.4 to 0.9.1. 
- [Release notes](https://github.com/rtfd/commonmark.py/releases) - [Changelog](https://github.com/readthedocs/commonmark.py/blob/master/CHANGELOG.md) - [Commits](https://github.com/rtfd/commonmark.py/commits/0.9.1) Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 32b8e4426ee..f53d273c405 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -5,7 +5,7 @@ beautifulsoup4==4.8.2 certifi==2017.11.5 chardet==3.0.4 click==6.7 -CommonMark==0.5.4 +CommonMark==0.9.1 cssmin==0.2.0 docutils==0.16 futures==3.1.1 From d96255ccc4d86cdaa150a542ba3f53c27b0b0265 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2020 07:44:01 +0000 Subject: [PATCH 11/40] Bump unidecode from 1.0.23 to 1.1.1 in /docs/tools Bumps [unidecode](https://github.com/kmike/text-unidecode) from 1.0.23 to 1.1.1. - [Release notes](https://github.com/kmike/text-unidecode/releases) - [Commits](https://github.com/kmike/text-unidecode/commits) Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 32b8e4426ee..c117f7539f5 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -31,6 +31,6 @@ Sphinx==1.6.5 sphinxcontrib-websupport==1.0.1 tornado==5.1 typing==3.7.4.1 -Unidecode==1.0.23 +Unidecode==1.1.1 urllib3==1.25.8 gitpython==2.1.14 From 9d7d89676099fc46f6faa21c6b352594c4bb0342 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Wed, 19 Feb 2020 07:44:25 +0000 Subject: [PATCH 12/40] Bump markupsafe from 1.0 to 1.1.1 in /docs/tools Bumps [markupsafe](https://github.com/pallets/markupsafe) from 1.0 to 1.1.1. 
- [Release notes](https://github.com/pallets/markupsafe/releases) - [Changelog](https://github.com/pallets/markupsafe/blob/master/CHANGES.rst) - [Commits](https://github.com/pallets/markupsafe/compare/1.0...1.1.1) Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 32b8e4426ee..b23b6cd1932 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -16,7 +16,7 @@ Jinja2==2.11.1 jsmin==2.2.2 livereload==2.5.1 Markdown==2.6.11 -MarkupSafe==1.0 +MarkupSafe==1.1.1 mkdocs==1.0.4 Pygments==2.5.2 python-slugify==1.2.6 From b2a52a2d6980e4ccf4cfd4f2d71b3da291616527 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 19 Feb 2020 11:49:26 +0300 Subject: [PATCH 13/40] Remove useless code #9210 --- dbms/src/Compression/ICompressionCodec.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/dbms/src/Compression/ICompressionCodec.cpp b/dbms/src/Compression/ICompressionCodec.cpp index d55b50caf33..ea3a74c18a5 100644 --- a/dbms/src/Compression/ICompressionCodec.cpp +++ b/dbms/src/Compression/ICompressionCodec.cpp @@ -9,23 +9,12 @@ #include -namespace ProfileEvents -{ - extern const Event ReadCompressedBytes; - extern const Event CompressedReadBufferBlocks; - extern const Event CompressedReadBufferBytes; -} - namespace DB { namespace ErrorCodes { - extern const int CHECKSUM_DOESNT_MATCH; - extern const int TOO_LARGE_SIZE_COMPRESSED; - extern const int UNKNOWN_COMPRESSION_METHOD; extern const int CANNOT_DECOMPRESS; - extern const int SEEK_POSITION_OUT_OF_BOUND; extern const int CORRUPTED_DATA; } From 811197558cc2695e1ff352dafbd06f4bab78b087 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Wed, 19 Feb 2020 13:33:31 +0000 Subject: [PATCH 14/40] Improve stack trace formatting for Poco and std exceptions Before: ``` Application: Caught exception while loading metadata: Poco::Exception. Code: 1000, e.code() = 0, e.displayText() = Access to file denied: boo0. /home/nv/clickhouse-master-clion/contrib/poco/Foundation/src/Exception.cpp:27: Poco::FileAccessDeniedException::FileAccessDeniedException(std::__1::basic_string, std::__1::allocator > const&, int) @ 0xbb598cc in /state/home/nv/clickhouse-builds/clickhouse-master-clion-gcc/dbms/programs/clickhous ``` After: ``` Application: Caught exception while loading metadata: Poco::Exception. Code: 1000, e.code() = 0, e.displayText() = Access to file denied: boo, Stack trace (when copying this message, always include the lines below): 0. /home/nv/clickhouse-master-clion/contrib/poco/Foundation/src/Exception.cpp:27: Poco::FileAccessDeniedException::FileAccessDeniedException(std::__1::basic_string, std::__1::allocator > const&, int) @ 0xbb5987c in /state/home/nv/clickhouse-builds/clickhouse-master-clion-gcc/dbms/programs/clickhous ``` --- dbms/src/Common/Exception.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Common/Exception.cpp b/dbms/src/Common/Exception.cpp index 318da1a27f2..00ef520f37c 100644 --- a/dbms/src/Common/Exception.cpp +++ b/dbms/src/Common/Exception.cpp @@ -193,7 +193,7 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded { stream << "Poco::Exception. Code: " << ErrorCodes::POCO_EXCEPTION << ", e.code() = " << e.code() << ", e.displayText() = " << e.displayText() - << (with_stacktrace ? getExceptionStackTraceString(e) : "") + << (with_stacktrace ? 
", Stack trace (when copying this message, always include the lines below):\n\n" + getExceptionStackTraceString(e) : "") << (with_extra_info ? getExtraExceptionInfo(e) : "") << " (version " << VERSION_STRING << VERSION_OFFICIAL << ")"; } @@ -210,9 +210,9 @@ std::string getCurrentExceptionMessage(bool with_stacktrace, bool check_embedded name += " (demangling status: " + toString(status) + ")"; stream << "std::exception. Code: " << ErrorCodes::STD_EXCEPTION << ", type: " << name << ", e.what() = " << e.what() - << (with_stacktrace ? getExceptionStackTraceString(e) : "") + << (with_stacktrace ? ", Stack trace (when copying this message, always include the lines below):\n\n" + getExceptionStackTraceString(e) : "") << (with_extra_info ? getExtraExceptionInfo(e) : "") - << ", version = " << VERSION_STRING << VERSION_OFFICIAL; + << " (version " << VERSION_STRING << VERSION_OFFICIAL << ")"; } catch (...) {} } From add281dfe31492f19fff0a651c02cccb028a174c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 19 Feb 2020 16:39:20 +0300 Subject: [PATCH 15/40] Enable metric_log by default --- dbms/programs/server/config.xml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dbms/programs/server/config.xml b/dbms/programs/server/config.xml index 27a6876ca13..ae15a583fcd 100644 --- a/dbms/programs/server/config.xml +++ b/dbms/programs/server/config.xml @@ -403,15 +403,13 @@ --> - system metric_log
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
- --> - ## CREATE DICTIONARY {#create-dictionary-query} ```sql @@ -300,3 +298,5 @@ External dictionary structure consists of attributes. Dictionary attributes are Depending on dictionary [layout](dicts/external_dicts_dict_layout.md) one or more attributes can be specified as dictionary keys. For more information, see [External Dictionaries](dicts/external_dicts.md) section. + +[Original article](https://clickhouse.tech/docs/en/query_language/create/) diff --git a/docs/en/query_language/dicts/external_dicts.md b/docs/en/query_language/dicts/external_dicts.md index 19a45e40e73..46733654f68 100644 --- a/docs/en/query_language/dicts/external_dicts.md +++ b/docs/en/query_language/dicts/external_dicts.md @@ -6,7 +6,7 @@ ClickHouse: - Fully or partially stores dictionaries in RAM. - Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically. -- Allows to create external dictionaries with xml-files or [DDL queries](../create.md#create-dictionary-query). +- Allows to create external dictionaries with xml files or [DDL queries](../create.md#create-dictionary-query). The configuration of external dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../operations/server_settings/settings.md#server_settings-dictionaries_config) parameter. @@ -34,12 +34,16 @@ You can [configure](external_dicts_dict.md) any number of dictionaries in the sa [DDL queries for dictionaries](../create.md#create-dictionary-query) doesn't require any additional records in server configuration. They allow to work with dictionaries as first-class entities, like tables or views. -!!! attention +!!! attention "Attention" You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../functions/other_functions.md) function). This functionality is not related to external dictionaries. -**See also** +## See also {#ext-dicts-see-also} +- [Configuring an External Dictionary](external_dicts_dict.md) +- [Storing Dictionaries in Memory](external_dicts_dict_layout.md) +- [Dictionary Updates](external_dicts_dict_lifetime.md) +- [Sources of External Dictionaries](external_dicts_dict_sources.md) +- [Dictionary Key and Fields](external_dicts_dict_structure.md) - [Functions for Working with External Dictionaries](../functions/ext_dict_functions.md) - [Original article](https://clickhouse.tech/docs/en/query_language/dicts/external_dicts/) diff --git a/docs/en/query_language/dicts/external_dicts_dict.md b/docs/en/query_language/dicts/external_dicts_dict.md index 03963c2d3b2..fba76199784 100644 --- a/docs/en/query_language/dicts/external_dicts_dict.md +++ b/docs/en/query_language/dicts/external_dicts_dict.md @@ -1,6 +1,6 @@ # Configuring an External Dictionary {#dicts-external_dicts_dict} -If dictionary is configured using xml-file, than dictionary configuration has the following structure: +If dictionary is configured using xml file, than dictionary configuration has the following structure: ```xml @@ -37,7 +37,7 @@ LAYOUT(...) -- Memory layout configuration LIFETIME(...) -- Lifetime of dictionary in memory ``` -- name – The identifier that can be used to access the dictionary. Use the characters `[a-zA-Z0-9_\-]`. +- `name` – The identifier that can be used to access the dictionary. Use the characters `[a-zA-Z0-9_\-]`. - [source](external_dicts_dict_sources.md) — Source of the dictionary. - [layout](external_dicts_dict_layout.md) — Dictionary layout in memory. 
- [structure](external_dicts_dict_structure.md) — Structure of the dictionary . A key and attributes that can be retrieved by this key. diff --git a/docs/en/query_language/dicts/external_dicts_dict_layout.md b/docs/en/query_language/dicts/external_dicts_dict_layout.md index 5a587b5408c..87034a5aa14 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/en/query_language/dicts/external_dicts_dict_layout.md @@ -34,7 +34,7 @@ The configuration looks like this: ``` -in case of [DDL-query](../create.md#create-dictionary-query), equal configuration will looks like +Corresponding [DDL-query](../create.md#create-dictionary-query): ```sql CREATE DICTIONARY (...) diff --git a/docs/en/query_language/dicts/external_dicts_dict_structure.md b/docs/en/query_language/dicts/external_dicts_dict_structure.md index 8c80c53561d..f5a0b0b6017 100644 --- a/docs/en/query_language/dicts/external_dicts_dict_structure.md +++ b/docs/en/query_language/dicts/external_dicts_dict_structure.md @@ -47,10 +47,14 @@ Attributes are described in the query body: ClickHouse supports the following types of keys: -- Numeric key. UInt64. Defined in the `` tag or using `PRIMARY KEY` keyword. +- Numeric key. `UInt64`. Defined in the `` tag or using `PRIMARY KEY` keyword. - Composite key. Set of values of different types. Defined in the tag `` or `PRIMARY KEY` keyword. -A xml structure can contain either `` or ``. DDL-query must contain single `PRIMARY KEY`. +An xml structure can contain either `` or ``. DDL-query must contain single `PRIMARY KEY`. + +!!! warning "Warning" + You must not describe key as an attribute. + ### Numeric Key {#ext_dict-numeric-key} diff --git a/docs/en/query_language/show.md b/docs/en/query_language/show.md index 93a2a1a8bd0..f6a9cc6865b 100644 --- a/docs/en/query_language/show.md +++ b/docs/en/query_language/show.md @@ -81,7 +81,7 @@ SELECT name FROM system.dictionaries WHERE database = [AND name LIKE diff --git a/docs/ru/query_language/create.md b/docs/ru/query_language/create.md index 7008e8588db..42568305cd3 100644 --- a/docs/ru/query_language/create.md +++ b/docs/ru/query_language/create.md @@ -274,4 +274,28 @@ SELECT a, b, c FROM (SELECT ...) Отсутствует отдельный запрос для удаления представлений. Чтобы удалить представление, следует использовать `DROP TABLE`. +## CREATE DICTIONARY {#create-dictionary-query} + +```sql +CREATE DICTIONARY [IF NOT EXISTS] [db.]dictionary_name +( + key1 type1 [DEFAULT|EXPRESSION expr1] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], + key2 type2 [DEFAULT|EXPRESSION expr2] [HIERARCHICAL|INJECTIVE|IS_OBJECT_ID], + attr1 type2 [DEFAULT|EXPRESSION expr3], + attr2 type2 [DEFAULT|EXPRESSION expr4] +) +PRIMARY KEY key1, key2 +SOURCE(SOURCE_NAME([param1 value1 ... paramN valueN])) +LAYOUT(LAYOUT_NAME([param_name param_value])) +LIFETIME([MIN val1] MAX val2) +``` + +Создаёт [внешний словарь](dicts/external_dicts.md) с заданной [структурой](dicts/external_dicts_dict_structure.md), [источником](dicts/external_dicts_dict_sources.md), [способом размещения в памяти](dicts/external_dicts_dict_layout.md) и [периодом обновления](dicts/external_dicts_dict_lifetime.md). + +Структура внешнего словаря состоит из атрибутов. Атрибуты словаря задаются как столбцы таблицы. Единственным обязательным свойством атрибута является его тип, все остальные свойства могут иметь значения по умолчанию. + +В зависимости от [способа размещения словаря в памяти](dicts/external_dicts_dict_layout.md), ключами словаря могут быть один и более атрибутов. 
+ +Смотрите [Внешние словари](dicts/external_dicts.md). + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/create/) diff --git a/docs/ru/query_language/dicts/external_dicts.md b/docs/ru/query_language/dicts/external_dicts.md index 4ebb8f5143d..7e38cd9221d 100644 --- a/docs/ru/query_language/dicts/external_dicts.md +++ b/docs/ru/query_language/dicts/external_dicts.md @@ -2,9 +2,12 @@ Существует возможность подключать собственные словари из различных источников данных. Источником данных для словаря может быть локальный текстовый/исполняемый файл, HTTP(s) ресурс или другая СУБД. Подробнее смотрите в разделе "[Источники внешних словарей](external_dicts_dict_sources.md)". -ClickHouse полностью или частично хранит словари в оперативной памяти. Словари можно подгружать динамически, ClickHouse периодически обновляет их и динамически подгружает отсутствующие значения. +ClickHouse: +- Полностью или частично хранит словари в оперативной памяти. +- Периодически обновляет их и динамически подгружает отсутствующие значения. +- Позволяет создавать внешние словари с помощью xml-файлов или [DDL-запросов](../create.md#create-dictionary-query). -Конфигурация внешних словарей находится в одном или нескольких файлах. Путь к конфигурации указывается в параметре [dictionaries_config](../../operations/server_settings/settings.md). +Конфигурация внешних словарей может находится в одном или нескольких xml-файлах. Путь к конфигурации указывается в параметре [dictionaries_config](../../operations/server_settings/settings.md). Словари могут загружаться при старте сервера или при первом использовании, в зависимости от настройки [dictionaries_lazy_load](../../operations/server_settings/settings.md). @@ -30,12 +33,15 @@ ClickHouse полностью или частично хранит словар ``` -В одном файле можно [сконфигурировать](external_dicts_dict.md) произвольное количество словарей. Формат файла сохраняется даже если словарь один (т.е. ` `). +В одном файле можно [сконфигурировать](external_dicts_dict.md) произвольное количество словарей. ->можете преобразовывать значения по небольшому словарю, описав его в запросе `SELECT` (см. функцию [transform](../functions/other_functions.md)). Эта функциональность не связана с внешними словарями. +Если вы создаёте внешние словари [DDL-запросами](../create.md#create-dictionary-query), то не задавайте конфигурацию словаря в конфигурации сервера. + +!!! attention "Внимание" + Можно преобразовывать значения по небольшому словарю, описав его в запросе `SELECT` (см. функцию [transform](../functions/other_functions.md)). Эта функциональность не связана с внешними словарями. -Смотрите также: +## Смотрите также {#ext-dicts-see-also} - [Настройка внешнего словаря](external_dicts_dict.md) - [Хранение словарей в памяти](external_dicts_dict_layout.md) diff --git a/docs/ru/query_language/dicts/external_dicts_dict.md b/docs/ru/query_language/dicts/external_dicts_dict.md index b004b2d8100..61946c10ee8 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict.md +++ b/docs/ru/query_language/dicts/external_dicts_dict.md @@ -1,11 +1,15 @@ # Настройка внешнего словаря {#dicts-external_dicts_dict} -Конфигурация словаря имеет следующую структуру: +XML-конфигурация словаря имеет следующую структуру: ```xml dict_name + + + + @@ -14,20 +18,29 @@ - - - - ``` -- name - Идентификатор, под которым словарь будет доступен для использования. Используйте символы `[a-zA-Z0-9_\-]`. -- [source](external_dicts_dict_sources.md) - Источник словаря. 
-- [layout](external_dicts_dict_layout.md) - Размещение словаря в памяти. -- [structure](external_dicts_dict_structure.md) - Структура словаря. Ключ и атрибуты, которые можно получить по ключу. -- [lifetime](external_dicts_dict_lifetime.md) - Периодичность обновления словарей. +Соответствующий [DDL-запрос](../create.md#create-dictionary-query) имеет следующий вид: + +```sql +CREATE DICTIONARY dict_name +( + ... -- attributes +) +PRIMARY KEY ... -- complex or single key configuration +SOURCE(...) -- Source configuration +LAYOUT(...) -- Memory layout configuration +LIFETIME(...) -- Lifetime of dictionary in memory +``` + +- `name` — Идентификатор, под которым словарь будет доступен для использования. Используйте символы `[a-zA-Z0-9_\-]`. +- [source](external_dicts_dict_sources.md) — Источник словаря. +- [layout](external_dicts_dict_layout.md) — Размещение словаря в памяти. +- [structure](external_dicts_dict_structure.md) — Структура словаря. Ключ и атрибуты, которые можно получить по ключу. +- [lifetime](external_dicts_dict_lifetime.md) — Периодичность обновления словарей. [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict/) diff --git a/docs/ru/query_language/dicts/external_dicts_dict_layout.md b/docs/ru/query_language/dicts/external_dicts_dict_layout.md index a1a35ae2c4d..0b0f8ab4b82 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_layout.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_layout.md @@ -35,15 +35,25 @@ ``` +Соответствущий [DDL-запрос](../create.md#create-dictionary-query): + +```sql +CREATE DICTIONARY (...) +... +LAYOUT(LAYOUT_TYPE(param value)) -- layout settings +... +``` + ## Способы размещения словарей в памяти -- [flat](#flat) -- [hashed](#hashed) -- [cache](#cache) -- [range_hashed](#range-hashed) -- [complex_key_hashed](#complex-key-hashed) -- [complex_key_cache](#complex-key-cache) -- [ip_trie](#ip-trie) +- [flat](#flat) +- [hashed](#hashed) +- [sparse_hashed](#dicts-external_dicts_dict_layout-sparse_hashed) +- [cache](#cache) +- [range_hashed](#range-hashed) +- [complex_key_hashed](#complex-key-hashed) +- [complex_key_cache](#complex-key-cache) +- [ip_trie](#ip-trie) ### flat @@ -63,6 +73,12 @@ ``` +или + +```sql +LAYOUT(FLAT()) +``` + ### hashed Словарь полностью хранится в оперативной памяти в виде хэш-таблиц. Словарь может содержать произвольное количество элементов с произвольными идентификаторами. На практике, количество ключей может достигать десятков миллионов элементов. @@ -77,6 +93,29 @@ ``` +или + +```sql +LAYOUT(HASHED()) +``` + +### sparse_hashed {#dicts-external_dicts_dict_layout-sparse_hashed} + +Аналогичен `hashed`, но при этом занимает меньше места в памяти и генерирует более высокую загрузку CPU. + +Пример конфигурации: + +```xml + + + +``` + +или + +```sql +LAYOUT(SPARSE_HASHED()) +``` ### complex_key_hashed @@ -90,6 +129,12 @@ ``` +или + +```sql +LAYOUT(COMPLEX_KEY_HASHED()) +``` + ### range_hashed @@ -131,6 +176,19 @@ ... 
``` +или + +```sql +CREATE DICTIONARY somedict ( + id UInt64, + first Date, + last Date +) +PRIMARY KEY id +LAYOUT(RANGE_HASHED()) +RANGE(MIN first MAX last) +``` + Для работы с такими словарями в функцию `dictGetT` необходимо передавать дополнительный аргумент, для которого подбирается диапазон: dictGetT('dict_name', 'attr_name', id, date) @@ -178,6 +236,18 @@ ``` +или + +```sql +CREATE DICTIONARY somedict( + Abcdef UInt64, + StartTimeStamp UInt64, + EndTimeStamp UInt64, + XXXType String DEFAULT '' +) +PRIMARY KEY Abcdef +RANGE(MIN StartTimeStamp MAX EndTimeStamp) +``` ### cache @@ -204,6 +274,12 @@ ``` +или + +```sql +LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) +``` + Укажите достаточно большой размер кэша. Количество ячеек следует подобрать экспериментальным путём: 1. Выставить некоторое значение. @@ -265,6 +341,17 @@ ... ``` +или + +```sql +CREATE DICTIONARY somedict ( + prefix String, + asn UInt32, + cca2 String DEFAULT '??' +) +PRIMARY KEY prefix +``` + Этот ключ должен иметь только один атрибут типа `String`, содержащий допустимый префикс IP. Другие типы еще не поддерживаются. Для запросов необходимо использовать те же функции (`dictGetT` с кортежем), что и для словарей с составными ключами: diff --git a/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md b/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md index 2108d3e49ff..d18ac8b38af 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_lifetime.md @@ -14,6 +14,15 @@ ClickHouse периодически обновляет словари. Инте ... ``` +или + +```sql +CREATE DICTIONARY (...) +... +LIFETIME(300) +... +``` + Настройка `0` запрещает обновление словарей. @@ -32,6 +41,12 @@ ClickHouse периодически обновляет словари. Инте ``` +или + +```sql +LIFETIME(MIN 300 MAX 360) +``` + При обновлении словарей сервер ClickHouse применяет различную логику в зависимости от типа [источника](external_dicts_dict_sources.md): > - У текстового файла проверяется время модификации. Если время изменилось по отношению к запомненному ранее, то словарь обновляется. @@ -56,4 +71,12 @@ ClickHouse периодически обновляет словари. Инте ``` +или + +```sql +... +SOURCE(ODBC(... invalidate_query 'SELECT update_time FROM dictionary_source where id = 1')) +... +``` + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/dicts/external_dicts_dict_lifetime/) diff --git a/docs/ru/query_language/dicts/external_dicts_dict_sources.md b/docs/ru/query_language/dicts/external_dicts_dict_sources.md index 8ef492e050f..58caf501847 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_sources.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_sources.md @@ -3,7 +3,7 @@ Внешний словарь можно подключить из множества источников. -Общий вид конфигурации: +Общий вид XML-конфигурации: ```xml @@ -20,6 +20,16 @@ ``` +Аналогичный [DDL-запрос](../create.md#create-dictionary-query): + +```sql +CREATE DICTIONARY dict_name (...) +... +SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration +... +``` + + Источник настраивается в разделе `source`. Типы источников (`source_type`): @@ -48,10 +58,16 @@ ``` +или + +```sql +SOURCE(FILE(path '/opt/dictionaries/os.tsv' format 'TabSeparated')) +``` + Поля настройки: -- `path` - Абсолютный путь к файлу. -- `format` - Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". +- `path` — Абсолютный путь к файлу. +- `format` — Формат файла. 
Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". ## Исполняемый файл {#dicts-external_dicts_dict_sources-executable} @@ -69,10 +85,16 @@ ``` +или + +```sql +SOURCE(EXECUTABLE(command 'cat /opt/dictionaries/os.tsv' format 'TabSeparated')) +``` + Поля настройки: -- `command` - Абсолютный путь к исполняемому файлу или имя файла (если каталог программы прописан в `PATH`). -- `format` - Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". +- `command` — Абсолютный путь к исполняемому файлу или имя файла (если каталог программы прописан в `PATH`). +- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". ## HTTP(s) {#dicts-external_dicts_dict_sources-http} @@ -86,16 +108,37 @@ http://[::1]/os.tsv TabSeparated + + user + password + + +
+                <name>API-KEY</name>
+                <value>key</value>
+
``` +или + +```sql +SOURCE(HTTP( + url 'http://[::1]/os.tsv' + format 'TabSeparated' + credentials(user 'user' password 'password') + headers(header(name 'API-KEY' value 'key')) +)) +``` + Чтобы ClickHouse смог обратиться к HTTPS-ресурсу, необходимо [настроить openSSL](../../operations/server_settings/settings.md) в конфигурации сервера. Поля настройки: -- `url` - URL источника. -- `format` - Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". +- `url` — URL источника. +- `format` — Формат файла. Поддерживаются все форматы, описанные в разделе "[Форматы](../../interfaces/formats.md#formats)". ## ODBC {#dicts-external_dicts_dict_sources-odbc} @@ -105,20 +148,33 @@ Пример настройки: ```xml - - DatabaseName - ShemaName.TableName
-    <connection_string>DSN=some_parameters</connection_string>
-    <invalidate_query>SQL_QUERY</invalidate_query>
-</odbc>
+<source>
+    <odbc>
+        <db>DatabaseName</db>
+        <table>ShemaName.TableName</table>
+        <connection_string>DSN=some_parameters</connection_string>
+        <invalidate_query>SQL_QUERY</invalidate_query>
+ +``` + +или + +```sql +SOURCE(ODBC( + db 'DatabaseName' + table 'SchemaName.TableName' + connection_string 'DSN=some_parameters' + invalidate_query 'SQL_QUERY' +)) ``` Поля настройки: -- `db` - имя базы данных. Не указывать, если имя базы задано в параметрах. ``. -- `table` - имя таблицы и схемы, если она есть. -- `connection_string` - строка соединения. -- `invalidate_query` - запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). +- `db` — имя базы данных. Не указывать, если имя базы задано в параметрах. ``. +- `table` — имя таблицы и схемы, если она есть. +- `connection_string` — строка соединения. +- `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). ClickHouse получает от ODBC-драйвера информацию о квотировании и квотирует настройки в запросах к драйверу, поэтому имя таблицы нужно указывать в соответствии с регистром имени таблицы в базе данных. @@ -216,6 +272,18 @@ $ sudo apt-get install -y unixodbc odbcinst odbc-postgresql ``` +или + +```sql +CREATE DICTIONARY table_name ( + id UInt64, + some_column UInt64 DEFAULT 0 +) +PRIMARY KEY id +SOURCE(ODBC(connection_string 'DSN=myconnection' table 'postgresql_table')) +LAYOUT(HASHED()) +LIFETIME(MIN 300 MAX 360) + Может понадобиться в `odbc.ini` указать полный путь до библиотеки с драйвером `DRIVER=/usr/local/lib/psqlodbcw.so`. ### Пример подключения MS SQL Server @@ -299,6 +367,20 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh ``` +или + +```sql +CREATE DICTIONARY test ( + k UInt64, + s String DEFAULT '' +) +PRIMARY KEY k +SOURCE(ODBC(table 'dict' connection_string 'DSN=MSSQL;UID=test;PWD=test')) +LAYOUT(FLAT()) +LIFETIME(MIN 300 MAX 360) +``` + + ## СУБД @@ -328,6 +410,22 @@ $ sudo apt-get install tdsodbc freetds-bin sqsh ``` +или + +```sql +SOURCE(MYSQL( + port 3306 + user 'clickhouse' + password 'qwerty' + replica(host 'example01-1' priority 1) + replica(host 'example01-2' priority 1) + db 'db_name' + table 'table_name' + where 'id=10' + invalidate_query 'SQL_QUERY' +)) +``` + Поля настройки: - `port` — порт сервера MySQL. Можно указать для всех реплик или для каждой в отдельности (внутри ``). @@ -362,6 +460,21 @@ MySQL можно подключить на локальном хосте чер ``` +или + +```sql +SOURCE(MYSQL( + host 'localhost' + socket '/path/to/socket/file.sock' + user 'clickhouse' + password 'qwerty' + db 'db_name' + table 'table_name' + where 'id=10' + invalidate_query 'SQL_QUERY' +)) +``` + ### ClickHouse {#dicts-external_dicts_dict_sources-clickhouse} @@ -381,16 +494,30 @@ MySQL можно подключить на локальном хосте чер ``` +или + +```sql +SOURCE(CLICKHOUSE( + host 'example01-01-1' + port 9000 + user 'default' + password '' + db 'default' + table 'ids' + where 'id=10' +)) +``` + Поля настройки: -- `host` - хост ClickHouse. Если host локальный, то запрос выполняется без сетевого взаимодействия. Чтобы повысить отказоустойчивость решения, можно создать таблицу типа [Distributed](../../operations/table_engines/distributed.md) и прописать её в дальнейших настройках. -- `port` - порт сервера ClickHouse. -- `user` - имя пользователя ClickHouse. -- `password` - пароль пользователя ClickHouse. -- `db` - имя базы данных. -- `table` - имя таблицы. -- `where` - условие выбора. Может отсутствовать. -- `invalidate_query` - запрос для проверки статуса словаря. Необязательный параметр. 
Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). +- `host` — хост ClickHouse. Если host локальный, то запрос выполняется без сетевого взаимодействия. Чтобы повысить отказоустойчивость решения, можно создать таблицу типа [Distributed](../../operations/table_engines/distributed.md) и прописать её в дальнейших настройках. +- `port` — порт сервера ClickHouse. +- `user` — имя пользователя ClickHouse. +- `password` — пароль пользователя ClickHouse. +- `db` — имя базы данных. +- `table` — имя таблицы. +- `where` — условие выбора. Может отсутствовать. +- `invalidate_query` — запрос для проверки статуса словаря. Необязательный параметр. Читайте подробнее в разделе [Обновление словарей](external_dicts_dict_lifetime.md). ### MongoDB {#dicts-external_dicts_dict_sources-mongodb} @@ -410,14 +537,27 @@ MySQL можно подключить на локальном хосте чер ``` +или + +```sql +SOURCE(MONGO( + host 'localhost' + port 27017 + user '' + password '' + db 'test' + collection 'dictionary_source' +)) +``` + Поля настройки: -- `host` - хост MongoDB. -- `port` - порт сервера MongoDB. -- `user` - имя пользователя MongoDB. -- `password` - пароль пользователя MongoDB. -- `db` - имя базы данных. -- `collection` - имя коллекции. +- `host` — хост MongoDB. +- `port` — порт сервера MongoDB. +- `user` — имя пользователя MongoDB. +- `password` — пароль пользователя MongoDB. +- `db` — имя базы данных. +- `collection` — имя коллекции. ### Redis {#dicts-external_dicts_dict_sources-redis} @@ -434,6 +574,17 @@ MySQL можно подключить на локальном хосте чер ``` +или + +```sql +SOURCE(REDIS( + host 'localhost' + port 6379 + storage_type 'simple' + db_index 0 +)) +``` + Поля настройки: - `host` – хост Redis. diff --git a/docs/ru/query_language/dicts/external_dicts_dict_structure.md b/docs/ru/query_language/dicts/external_dicts_dict_structure.md index c0f76db60b2..f9ab9d30dac 100644 --- a/docs/ru/query_language/dicts/external_dicts_dict_structure.md +++ b/docs/ru/query_language/dicts/external_dicts_dict_structure.md @@ -24,10 +24,10 @@ Атрибуты описываются элементами: - `` — [столбец с ключом](external_dicts_dict_structure.md#ext_dict_structure-key). -- `` — [столбец данных](external_dicts_dict_structure.md#ext_dict_structure-attributes). Можно задать несколько столбцов. +- `` — [столбец данных](external_dicts_dict_structure.md#ext_dict_structure-attributes). Можно задать несколько атрибутов. -Запрос создания словаря: +Создание словаря запросом: ```sql CREATE DICTIONARY dict_name ( @@ -48,10 +48,10 @@ PRIMARY KEY Id ClickHouse поддерживает следующие виды ключей: -- Числовой ключ. `UInt64`. Описывается в теге ``. -- Составной ключ. Набор значений разного типа. Описывается в теге ``. +- Числовой ключ. `UInt64`. Описывается в теге `` или ключевым словом `PRIMARY KEY`. +- Составной ключ. Набор значений разного типа. Описывается в теге `` или ключевым словом `PRIMARY KEY`. -Структура может содержать либо `` либо ``. +Структура может содержать либо `` либо ``. DDL-запрос может содержать только `PRIMARY KEY`. !!! warning "Обратите внимание" Ключ не надо дополнительно описывать в атрибутах. @@ -72,6 +72,20 @@ ClickHouse поддерживает следующие виды ключей: - `name` — имя столбца с ключами. +Для DDL-запроса: + +```sql +CREATE DICTIONARY ( + Id UInt64, + ... +) +PRIMARY KEY Id +... +``` + +- `PRIMARY KEY` – имя столбца с ключами. + + ### Составной ключ Ключом может быть кортеж (`tuple`) из полей произвольных типов. 
В этом случае [layout](external_dicts_dict_layout.md) должен быть `complex_key_hashed` или `complex_key_cache`. @@ -97,6 +111,18 @@ ClickHouse поддерживает следующие виды ключей: ... ``` +или + +```sql +CREATE DICTIONARY ( + field1 String, + field2 String + ... +) +PRIMARY KEY field1, field2 +... +``` + При запросе в функции `dictGet*` в качестве ключа передаётся кортеж. Пример: `dictGetString('dict_name', 'attr_name', tuple('string for field1', num_for_field2))`. @@ -119,6 +145,15 @@ ClickHouse поддерживает следующие виды ключей: ``` +или + +```sql +CREATE DICTIONARY somename ( + Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID +) +``` + + Поля конфигурации: | Тег | Описание | Обязательный | diff --git a/docs/ru/query_language/show.md b/docs/ru/query_language/show.md index 4eec70a8002..680b42809c5 100644 --- a/docs/ru/query_language/show.md +++ b/docs/ru/query_language/show.md @@ -3,10 +3,10 @@ ## SHOW CREATE TABLE ```sql -SHOW CREATE [TEMPORARY] TABLE [db.]table [INTO OUTFILE filename] [FORMAT format] +SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format] ``` -Возвращает один столбец типа `String` с именем statement, содержащий одно значение — запрос `CREATE TABLE`, с помощью которого была создана указанная таблица. +Возвращает один столбец типа `String` с именем statement, содержащий одно значение — запрос `CREATE TABLE`, с помощью которого был создан указанный объект. ## SHOW DATABASES {#show-databases} @@ -62,3 +62,35 @@ SHOW TABLES FROM system LIKE '%co%' LIMIT 2 │ collations │ └────────────────────────────────┘ ``` + +## SHOW DICTIONARIES + +Выводит список [внешних словарей](dicts/external_dicts.md). + +```sql +SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +Если секция `FROM` не указана, запрос возвращает список словарей из текущей базы данных. + +Аналогичный результат можно получить следующим запросом: + +```sql +SELECT name FROM system.dictionaries WHERE database = [AND name LIKE ] [LIMIT ] [INTO OUTFILE ] [FORMAT ] +``` + +**Example** + +Запрос выводит первые две стоки из списка таблиц в базе данных `system`, имена которых содержат `reg`. + +```sql +SHOW DICTIONARIES FROM db LIKE '%reg%' LIMIT 2 +``` +```text +┌─name─────────┐ +│ regions │ +│ region_names │ +└──────────────┘ +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/show/) From 49b86a638d9a21afa75dbd23cf1f50fdb7b83954 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2020 07:42:43 +0000 Subject: [PATCH 27/40] Bump requests from 2.21.0 to 2.23.0 in /docs/tools Bumps [requests](https://github.com/psf/requests) from 2.21.0 to 2.23.0. 
- [Release notes](https://github.com/psf/requests/releases) - [Changelog](https://github.com/psf/requests/blob/master/HISTORY.md) - [Commits](https://github.com/psf/requests/compare/v2.21.0...v2.23.0) Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 078fa607d1f..2dbb65c2133 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -23,7 +23,7 @@ python-slugify==1.2.6 pytz==2017.3 PyYAML==5.3 recommonmark==0.4.0 -requests==2.21.0 +requests==2.23.0 singledispatch==3.4.0.3 six==1.11.0 snowballstemmer==1.2.1 From 694185c0c2413b58feb1867ba37e7bd9fda4108c Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2020 07:42:59 +0000 Subject: [PATCH 28/40] Bump idna from 2.6 to 2.9 in /docs/tools Bumps [idna](https://github.com/kjd/idna) from 2.6 to 2.9. - [Release notes](https://github.com/kjd/idna/releases) - [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.rst) - [Commits](https://github.com/kjd/idna/compare/v2.6...v2.9) Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 078fa607d1f..63cfda54480 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -10,7 +10,7 @@ cssmin==0.2.0 docutils==0.16 futures==3.1.1 htmlmin==0.1.12 -idna==2.6 +idna==2.9 imagesize==1.2.0 Jinja2==2.11.1 jsmin==2.2.2 From 8230160a00997b8f0cd014a56b3078b98cf33929 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2020 07:43:22 +0000 Subject: [PATCH 29/40] Bump six from 1.11.0 to 1.14.0 in /docs/tools Bumps [six](https://github.com/benjaminp/six) from 1.11.0 to 1.14.0. - [Release notes](https://github.com/benjaminp/six/releases) - [Changelog](https://github.com/benjaminp/six/blob/master/CHANGES) - [Commits](https://github.com/benjaminp/six/compare/1.11.0...1.14.0) Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 078fa607d1f..381011ed4d2 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -25,7 +25,7 @@ PyYAML==5.3 recommonmark==0.4.0 requests==2.21.0 singledispatch==3.4.0.3 -six==1.11.0 +six==1.14.0 snowballstemmer==1.2.1 Sphinx==1.6.5 sphinxcontrib-websupport==1.0.1 From 24153a5233ceceb44575f28726ea89907e7c8168 Mon Sep 17 00:00:00 2001 From: "dependabot-preview[bot]" <27856297+dependabot-preview[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2020 07:43:43 +0000 Subject: [PATCH 30/40] Bump certifi from 2017.11.5 to 2019.11.28 in /docs/tools Bumps [certifi](https://certifi.io/) from 2017.11.5 to 2019.11.28. 
Signed-off-by: dependabot-preview[bot] --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 078fa607d1f..d1d6ef1a423 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -2,7 +2,7 @@ alabaster==0.7.12 Babel==2.5.1 backports-abc==0.5 beautifulsoup4==4.8.2 -certifi==2017.11.5 +certifi==2019.11.28 chardet==3.0.4 click==6.7 CommonMark==0.9.1 From 23ceddebbcea2101e3e76d6bfcf76e17c7835059 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 20 Feb 2020 12:06:00 +0300 Subject: [PATCH 31/40] codestyle final 2 --- dbms/programs/copier/ClusterPartition.h | 2 +- dbms/programs/copier/Internals.cpp | 2 +- dbms/programs/copier/Internals.h | 8 ++++---- dbms/programs/copier/TaskTableAndShard.h | 12 ++++++------ 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dbms/programs/copier/ClusterPartition.h b/dbms/programs/copier/ClusterPartition.h index 89dd1c1d2a2..ed69bfa8c26 100644 --- a/dbms/programs/copier/ClusterPartition.h +++ b/dbms/programs/copier/ClusterPartition.h @@ -5,7 +5,7 @@ namespace DB { /// Contains info about all shards that contain a partition - struct ClusterPartition + struct ClusterPartition { double elapsed_time_seconds = 0; UInt64 bytes_copied = 0; diff --git a/dbms/programs/copier/Internals.cpp b/dbms/programs/copier/Internals.cpp index 84add0f2098..331bdeb25de 100644 --- a/dbms/programs/copier/Internals.cpp +++ b/dbms/programs/copier/Internals.cpp @@ -1,6 +1,6 @@ #include "Internals.h" -namespace DB +namespace DB { ConfigurationPtr getConfigurationFromXMLString(const std::string & xml_data) diff --git a/dbms/programs/copier/Internals.h b/dbms/programs/copier/Internals.h index 57fcd2bfb01..c1377a250d8 100644 --- a/dbms/programs/copier/Internals.h +++ b/dbms/programs/copier/Internals.h @@ -106,19 +106,19 @@ struct TaskStateWithOwner TaskState state{TaskState::Unknown}; String owner; - static String getData(TaskState state, const String &owner) + static String getData(TaskState state, const String &owner) { return TaskStateWithOwner(state, owner).toString(); } - String toString() + String toString() { WriteBufferFromOwnString wb; wb << static_cast(state) << "\n" << escape << owner; return wb.str(); } - static TaskStateWithOwner fromString(const String & data) + static TaskStateWithOwner fromString(const String & data) { ReadBufferFromString rb(data); TaskStateWithOwner res; @@ -142,7 +142,7 @@ struct ShardPriority size_t hostname_difference = 0; UInt8 random = 0; - static bool greaterPriority(const ShardPriority & current, const ShardPriority & other) + static bool greaterPriority(const ShardPriority & current, const ShardPriority & other) { return std::forward_as_tuple(current.is_remote, current.hostname_difference, current.random) < std::forward_as_tuple(other.is_remote, other.hostname_difference, other.random); diff --git a/dbms/programs/copier/TaskTableAndShard.h b/dbms/programs/copier/TaskTableAndShard.h index 4e4aaf18a96..f08cb208ee2 100644 --- a/dbms/programs/copier/TaskTableAndShard.h +++ b/dbms/programs/copier/TaskTableAndShard.h @@ -84,8 +84,8 @@ struct TaskTable }; - struct TaskShard - { +struct TaskShard +{ TaskShard(TaskTable &parent, const ShardInfo &info_) : task_table(parent), info(info_) {} TaskTable & task_table; @@ -222,13 +222,13 @@ inline TaskTable::TaskTable(TaskCluster & parent, const Poco::Util::AbstractConf } template -inline void TaskTable::initShards(RandomEngine && random_engine) +inline void 
TaskTable::initShards(RandomEngine && random_engine) { const String & fqdn_name = getFQDNOrHostName(); std::uniform_int_distribution get_urand(0, std::numeric_limits::max()); // Compute the priority - for (auto & shard_info : cluster_pull->getShardsInfo()) + for (auto & shard_info : cluster_pull->getShardsInfo()) { TaskShardPtr task_shard = std::make_shared(*this, shard_info); const auto & replicas = cluster_pull->getShardsAddresses().at(task_shard->indexInCluster()); @@ -239,14 +239,14 @@ inline void TaskTable::initShards(RandomEngine && random_engine) // Sort by priority std::sort(all_shards.begin(), all_shards.end(), - [](const TaskShardPtr & lhs, const TaskShardPtr & rhs) + [](const TaskShardPtr & lhs, const TaskShardPtr & rhs) { return ShardPriority::greaterPriority(lhs->priority, rhs->priority); }); // Cut local shards auto it_first_remote = std::lower_bound(all_shards.begin(), all_shards.end(), 1, - [](const TaskShardPtr & lhs, UInt8 is_remote) + [](const TaskShardPtr & lhs, UInt8 is_remote) { return lhs->priority.is_remote < is_remote; }); From 5a67c02a5d18ed9e3df00760c96945d3400cf785 Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Thu, 20 Feb 2020 12:53:02 +0300 Subject: [PATCH 32/40] In KeyCondition: Fixed execution of inversed predicates for non-strictly monotinic functional index (#9223) * Tests for functional index * Fixed execution of inversed predicates in functional index When non-strictly monotonic functional index is used inverted predicated may be executed incorrectly, which leads to multiple problems: #8821, #9034 --- dbms/src/Storages/MergeTree/KeyCondition.cpp | 114 +++++++++++++++--- dbms/src/Storages/MergeTree/KeyCondition.h | 4 +- ...83_functional_index_in_mergetree.reference | 33 +++++ .../01083_functional_index_in_mergetree.sql | 33 +++++ 4 files changed, 163 insertions(+), 21 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.reference create mode 100644 dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.sql diff --git a/dbms/src/Storages/MergeTree/KeyCondition.cpp b/dbms/src/Storages/MergeTree/KeyCondition.cpp index 4ce58c85809..f8c7db4a423 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.cpp +++ b/dbms/src/Storages/MergeTree/KeyCondition.cpp @@ -264,6 +264,78 @@ const KeyCondition::AtomMap KeyCondition::atom_map }; +static const std::map inverse_relations = { + {"equals", "notEquals"}, + {"notEquals", "equals"}, + {"less", "greaterOrEquals"}, + {"greaterOrEquals", "less"}, + {"greater", "lessOrEquals"}, + {"lessOrEquals", "greater"}, + {"in", "notIn"}, + {"notIn", "in"}, + {"like", "notLike"}, + {"notLike", "like"}, + {"empty", "notEmpty"}, + {"notEmpty", "empty"}, +}; + + +bool isLogicalOperator(const String & func_name) +{ + return (func_name == "and" || func_name == "or" || func_name == "not" || func_name == "indexHint"); +} + +/// The node can be one of: +/// - Logical operator (AND, OR, NOT and indexHint() - logical NOOP) +/// - An "atom" (relational operator, constant, expression) +/// - A logical constant expression +/// - Any other function +ASTPtr cloneASTWithInversionPushDown(const ASTPtr node, const bool need_inversion = false) +{ + const ASTFunction * func = node->as(); + + if (func && isLogicalOperator(func->name)) + { + if (func->name == "not") + { + return cloneASTWithInversionPushDown(func->arguments->children.front(), !need_inversion); + } + + const auto result_node = makeASTFunction(func->name); + + /// indexHint() is a special case - logical NOOP function + if 
(result_node->name != "indexHint" && need_inversion) + { + result_node->name = (result_node->name == "and") ? "or" : "and"; + } + + if (func->arguments) + { + for (const auto & child : func->arguments->children) + { + result_node->arguments->children.push_back(cloneASTWithInversionPushDown(child, need_inversion)); + } + } + + return result_node; + } + + const auto cloned_node = node->clone(); + + if (func && inverse_relations.find(func->name) != inverse_relations.cend()) + { + if (need_inversion) + { + cloned_node->as()->name = inverse_relations.at(func->name); + } + + return cloned_node; + } + + return need_inversion ? makeASTFunction("not", cloned_node) : cloned_node; +} + + inline bool Range::equals(const Field & lhs, const Field & rhs) { return applyVisitor(FieldVisitorAccurateEquals(), lhs, rhs); } inline bool Range::less(const Field & lhs, const Field & rhs) { return applyVisitor(FieldVisitorAccurateLess(), lhs, rhs); } @@ -345,21 +417,23 @@ KeyCondition::KeyCondition( */ Block block_with_constants = getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context); - /// Trasform WHERE section to Reverse Polish notation - const auto & select = query_info.query->as(); - if (select.where()) + const ASTSelectQuery & select = query_info.query->as(); + if (select.where() || select.prewhere()) { - traverseAST(select.where(), context, block_with_constants); + ASTPtr filter_query; + if (select.where() && select.prewhere()) + filter_query = makeASTFunction("and", select.where(), select.prewhere()); + else + filter_query = select.where() ? select.where() : select.prewhere(); - if (select.prewhere()) - { - traverseAST(select.prewhere(), context, block_with_constants); - rpn.emplace_back(RPNElement::FUNCTION_AND); - } - } - else if (select.prewhere()) - { - traverseAST(select.prewhere(), context, block_with_constants); + /** When non-strictly monotonic functions are employed in functional index (e.g. ORDER BY toStartOfHour(dateTime)), + * the use of NOT operator in predicate will result in the indexing algorithm leave out some data. + * This is caused by rewriting in KeyCondition::tryParseAtomFromAST of relational operators to less strict + * when parsing the AST into internal RPN representation. + * To overcome the problem, before parsing the AST we transform it to its semantically equivalent form where all NOT's + * are pushed down and applied (when possible) to leaf nodes. 
+ */ + traverseAST(cloneASTWithInversionPushDown(filter_query), context, block_with_constants); } else { @@ -432,9 +506,9 @@ void KeyCondition::traverseAST(const ASTPtr & node, const Context & context, Blo { RPNElement element; - if (auto * func = node->as()) + if (const auto * func = node->as()) { - if (operatorFromAST(func, element)) + if (tryParseLogicalOperatorFromAST(func, element)) { auto & args = func->arguments->children; for (size_t i = 0, size = args.size(); i < size; ++i) @@ -452,7 +526,7 @@ void KeyCondition::traverseAST(const ASTPtr & node, const Context & context, Blo } } - if (!atomFromAST(node, context, block_with_constants, element)) + if (!tryParseAtomFromAST(node, context, block_with_constants, element)) { element.function = RPNElement::FUNCTION_UNKNOWN; } @@ -680,7 +754,7 @@ static void castValueToType(const DataTypePtr & desired_type, Field & src_value, } -bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out) +bool KeyCondition::tryParseAtomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out) { /** Functions < > = != <= >= in `notIn`, where one argument is a constant, and the other is one of columns of key, * or itself, wrapped in a chain of possibly-monotonic functions, @@ -768,7 +842,9 @@ bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Blo func_name = "lessOrEquals"; else if (func_name == "lessOrEquals") func_name = "greaterOrEquals"; - else if (func_name == "in" || func_name == "notIn" || func_name == "like") + else if (func_name == "in" || func_name == "notIn" || + func_name == "like" || func_name == "notLike" || + func_name == "startsWith") { /// "const IN data_column" doesn't make sense (unlike "data_column IN const") return false; @@ -809,7 +885,7 @@ bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Blo return false; } -bool KeyCondition::operatorFromAST(const ASTFunction * func, RPNElement & out) +bool KeyCondition::tryParseLogicalOperatorFromAST(const ASTFunction * func, RPNElement & out) { /// Functions AND, OR, NOT. 
/** Also a special function `indexHint` - works as if instead of calling a function there are just parentheses diff --git a/dbms/src/Storages/MergeTree/KeyCondition.h b/dbms/src/Storages/MergeTree/KeyCondition.h index fd1d11c0ec8..004cfbc9ea8 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.h +++ b/dbms/src/Storages/MergeTree/KeyCondition.h @@ -369,8 +369,8 @@ private: BoolMask initial_mask) const; void traverseAST(const ASTPtr & node, const Context & context, Block & block_with_constants); - bool atomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out); - bool operatorFromAST(const ASTFunction * func, RPNElement & out); + bool tryParseAtomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out); + bool tryParseLogicalOperatorFromAST(const ASTFunction * func, RPNElement & out); /** Is node the key column * or expression in which column of key is wrapped by chain of functions, diff --git a/dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.reference b/dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.reference new file mode 100644 index 00000000000..bff552df991 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.reference @@ -0,0 +1,33 @@ +TP1 +7.51 +7.42 +7.41 +7.42 +7.41 +7.42 +7.41 +7.42 +7.41 +7.51 +TP2 +7.42 +7.41 +7.42 +7.51 +7.42 +7.41 +7.51 +7.51 +TP3 +7.42 +7.41 +7.51 +TP4 +7.42 +7.41 +7.42 +7.42 +7.41 +TP5 +7.41 +7.51 diff --git a/dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.sql b/dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.sql new file mode 100644 index 00000000000..d0fbf3356c8 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01083_functional_index_in_mergetree.sql @@ -0,0 +1,33 @@ +SET max_threads = 1; + +CREATE TABLE IF NOT EXISTS functional_index_mergetree (x Float64) ENGINE = MergeTree ORDER BY round(x); +INSERT INTO functional_index_mergetree VALUES (7.42)(7.41)(7.51); + +SELECT 'TP1'; +SELECT * FROM functional_index_mergetree WHERE x > 7.42; +SELECT * FROM functional_index_mergetree WHERE x < 7.49; +SELECT * FROM functional_index_mergetree WHERE x < 7.5; + +SELECT * FROM functional_index_mergetree WHERE NOT (NOT x < 7.49); +SELECT * FROM functional_index_mergetree WHERE NOT (NOT x < 7.5); +SELECT * FROM functional_index_mergetree WHERE NOT (NOT x > 7.42); + +SELECT 'TP2'; +SELECT * FROM functional_index_mergetree WHERE NOT x > 7.49; +SELECT * FROM functional_index_mergetree WHERE NOT x < 7.42; +SELECT * FROM functional_index_mergetree WHERE NOT x < 7.41; +SELECT * FROM functional_index_mergetree WHERE NOT x < 7.5; + +SELECT 'TP3'; +SELECT * FROM functional_index_mergetree WHERE x > 7.41 AND x < 7.51; +SELECT * FROM functional_index_mergetree WHERE NOT (x > 7.41 AND x < 7.51); + +SELECT 'TP4'; +SELECT * FROM functional_index_mergetree WHERE NOT x < 7.41 AND NOT x > 7.49; +SELECT * FROM functional_index_mergetree WHERE NOT x < 7.42 AND NOT x > 7.42; +SELECT * FROM functional_index_mergetree WHERE (NOT x < 7.4) AND (NOT x > 7.49); + +SELECT 'TP5'; +SELECT * FROM functional_index_mergetree WHERE NOT or(NOT x, toUInt64(x) AND NOT floor(x) > 6, x >= 7.42 AND round(x) <= 7); + +DROP TABLE functional_index_mergetree; From 7cf650653431d24e2920a586d1586ec428f0911a Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 20 Feb 2020 11:01:31 +0100 Subject: [PATCH 33/40] Update documentation for system.replicas --- docs/en/operations/system_tables.md | 119 ++++++++++++++-------------- docs/ru/operations/system_tables.md | 119 ++++++++++++++-------------- 2 files changed, 116 insertions(+), 122 deletions(-) diff --git a/docs/en/operations/system_tables.md b/docs/en/operations/system_tables.md index 5244877c469..0eb1b8d67f6 100644 --- a/docs/en/operations/system_tables.md +++ b/docs/en/operations/system_tables.md @@ -694,76 +694,73 @@ FORMAT Vertical ```text Row 1: ────── -database: merge -table: visits -engine: ReplicatedCollapsingMergeTree -is_leader: 1 -is_readonly: 0 -is_session_expired: 0 -future_parts: 1 -parts_to_check: 0 -zookeeper_path: /clickhouse/tables/01-06/visits -replica_name: example01-06-1.yandex.ru -replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru -columns_version: 9 -queue_size: 1 -inserts_in_queue: 0 -merges_in_queue: 1 -log_max_index: 596273 -log_pointer: 596274 -total_replicas: 2 -active_replicas: 2 +database: merge +table: visits +engine: ReplicatedCollapsingMergeTree +is_leader: 1 +can_become_leader: 1 +is_readonly: 0 +is_session_expired: 0 +future_parts: 1 +parts_to_check: 0 +zookeeper_path: /clickhouse/tables/01-06/visits +replica_name: example01-06-1.yandex.ru +replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru +columns_version: 9 +queue_size: 1 +inserts_in_queue: 0 +merges_in_queue: 1 +part_mutations_in_queue: 0 +queue_oldest_time: 2020-02-20 08:34:30 +inserts_oldest_time: 0000-00-00 00:00:00 +merges_oldest_time: 2020-02-20 08:34:30 +part_mutations_oldest_time: 0000-00-00 00:00:00 +oldest_part_to_get: +oldest_part_to_merge_to: 20200220_20284_20840_7 +oldest_part_to_mutate_to: +log_max_index: 596273 +log_pointer: 596274 +last_queue_update: 2020-02-20 08:34:32 +absolute_delay: 0 +total_replicas: 2 +active_replicas: 2 ``` Columns: -```text -database: Database name -table: Table name -engine: Table engine name - -is_leader: Whether the replica is the leader. - -Only one replica at a time can be the leader. The leader is responsible for selecting background merges to perform. +- `database` (`String`) - Database name +- `table` (`String`) - Table name +- `engine` (`String`) - Table engine name +- `is_leader` (`UInt8`) - Whether the replica is the leader. +Only one replica at a time can be the leader. The leader is responsible for selecting background merges to perform. Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader. - -is_readonly: Whether the replica is in read-only mode. +- `can_become_leader` (`UInt8`) - Whether the replica can be elected as a leader. +- `is_readonly` (`UInt8`) - Whether the replica is in read-only mode. This mode is turned on if the config doesn't have sections with ZooKeeper, if an unknown error occurred when reinitializing sessions in ZooKeeper, and during session reinitialization in ZooKeeper. - -is_session_expired: Whether the session with ZooKeeper has expired. -Basically the same as 'is_readonly'. - -future_parts: The number of data parts that will appear as the result of INSERTs or merges that haven't been done yet. - -parts_to_check: The number of data parts in the queue for verification. -A part is put in the verification queue if there is suspicion that it might be damaged. - -zookeeper_path: Path to table data in ZooKeeper. -replica_name: Replica name in ZooKeeper. 
Different replicas of the same table have different names. -replica_path: Path to replica data in ZooKeeper. The same as concatenating 'zookeeper_path/replicas/replica_path'. - -columns_version: Version number of the table structure. -Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven't made all of the ALTERs yet. - -queue_size: Size of the queue for operations waiting to be performed. -Operations include inserting blocks of data, merges, and certain other actions. -It usually coincides with 'future_parts'. - -inserts_in_queue: Number of inserts of blocks of data that need to be made. -Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. - -merges_in_queue: The number of merges waiting to be made. -Sometimes merges are lengthy, so this value may be greater than zero for a long time. +- `is_session_expired` (`UInt8`) - the session with ZooKeeper has expired. Basically the same as `is_readonly`. +- `future_parts` (`UInt32`) - The number of data parts that will appear as the result of INSERTs or merges that haven't been done yet. +- `parts_to_check` (`UInt32`) - The number of data parts in the queue for verification. A part is put in the verification queue if there is suspicion that it might be damaged. +- `zookeeper_path` (`String`) - Path to table data in ZooKeeper. +- `replica_name` (`String`) - Replica name in ZooKeeper. Different replicas of the same table have different names. +- `replica_path` (`String`) - Path to replica data in ZooKeeper. The same as concatenating 'zookeeper_path/replicas/replica_path'. +- `columns_version` (`Int32`) - Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven't made all of the ALTERs yet. +- `queue_size` (`UInt32`) - Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with `future_parts`. +- `inserts_in_queue` (`UInt32`) - Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong. +- `merges_in_queue` (`UInt32`) - The number of merges waiting to be made. Sometimes merges are lengthy, so this value may be greater than zero for a long time. +- `part_mutations_in_queue` (`UInt32`) - The number of mutations waiting to be made. +- `queue_oldest_time` (`DateTime`) - If `queue_size` greater than 0, shows when the oldest operation was added to the queue. +- `inserts_oldest_time` (`DateTime`) - See `queue_oldest_time` +- `merges_oldest_time` (`DateTime`) - See `queue_oldest_time` +- `part_mutations_oldest_time` (`DateTime`) - See `queue_oldest_time` The next 4 columns have a non-zero value only where there is an active session with ZK. -log_max_index: Maximum entry number in the log of general activity. -log_pointer: Maximum entry number in the log of general activity that the replica copied to its execution queue, plus one. -If log_pointer is much smaller than log_max_index, something is wrong. - -total_replicas: The total number of known replicas of this table. -active_replicas: The number of replicas of this table that have a session in ZooKeeper (i.e., the number of functioning replicas). -``` +- `log_max_index` (`UInt64`) - Maximum entry number in the log of general activity. 
+- `log_pointer` (`UInt64`) - Maximum entry number in the log of general activity that the replica copied to its execution queue, plus one. If `log_pointer` is much smaller than `log_max_index`, something is wrong. +- `last_queue_update` (`DateTime`) - When the queue was updated last time. +- `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has. +- `total_replicas` (`UInt8`) - The total number of known replicas of this table. +- `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ZooKeeper (i.e., the number of functioning replicas). If you request all the columns, the table may work a bit slowly, since several reads from ZooKeeper are made for each row. If you don't request the last 4 columns (log_max_index, log_pointer, total_replicas, active_replicas), the table works quickly. diff --git a/docs/ru/operations/system_tables.md b/docs/ru/operations/system_tables.md index 400412325d2..dad397462d9 100644 --- a/docs/ru/operations/system_tables.md +++ b/docs/ru/operations/system_tables.md @@ -741,76 +741,73 @@ FORMAT Vertical ```text Row 1: ────── -database: merge -table: visits -engine: ReplicatedCollapsingMergeTree -is_leader: 1 -is_readonly: 0 -is_session_expired: 0 -future_parts: 1 -parts_to_check: 0 -zookeeper_path: /clickhouse/tables/01-06/visits -replica_name: example01-06-1.yandex.ru -replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru -columns_version: 9 -queue_size: 1 -inserts_in_queue: 0 -merges_in_queue: 1 -log_max_index: 596273 -log_pointer: 596274 -total_replicas: 2 -active_replicas: 2 +database: merge +table: visits +engine: ReplicatedCollapsingMergeTree +is_leader: 1 +can_become_leader: 1 +is_readonly: 0 +is_session_expired: 0 +future_parts: 1 +parts_to_check: 0 +zookeeper_path: /clickhouse/tables/01-06/visits +replica_name: example01-06-1.yandex.ru +replica_path: /clickhouse/tables/01-06/visits/replicas/example01-06-1.yandex.ru +columns_version: 9 +queue_size: 1 +inserts_in_queue: 0 +merges_in_queue: 1 +part_mutations_in_queue: 0 +queue_oldest_time: 2020-02-20 08:34:30 +inserts_oldest_time: 0000-00-00 00:00:00 +merges_oldest_time: 2020-02-20 08:34:30 +part_mutations_oldest_time: 0000-00-00 00:00:00 +oldest_part_to_get: +oldest_part_to_merge_to: 20200220_20284_20840_7 +oldest_part_to_mutate_to: +log_max_index: 596273 +log_pointer: 596274 +last_queue_update: 2020-02-20 08:34:32 +absolute_delay: 0 +total_replicas: 2 +active_replicas: 2 ``` Столбцы: -```text -database: имя БД -table: имя таблицы -engine: имя движка таблицы - -is_leader: является ли реплика лидером - -В один момент времени, не более одной из реплик является лидером. Лидер отвечает за выбор фоновых слияний, которые следует произвести. +- `database` (`String`) - имя БД. +- `table` (`String`) - имя таблицы. +- `engine` (`String`) - имя движка таблицы. +- `is_leader` (`UInt8`) - является ли реплика лидером. +В один момент времени, не более одной из реплик является лидером. Лидер отвечает за выбор фоновых слияний, которые следует произвести. Замечу, что запись можно осуществлять на любую реплику (доступную и имеющую сессию в ZK), независимо от лидерства. - -is_readonly: находится ли реплика в режиме "только для чтения" +- `can_become_leader` (`UInt8`) - может ли реплика быть выбрана лидером. +- `is_readonly` (`UInt8`) - находится ли реплика в режиме "только для чтения" Этот режим включается, если в конфиге нет секции с ZK; если при переинициализации сессии в ZK произошла неизвестная ошибка; во время переинициализации сессии с ZK. 
- -is_session_expired: истекла ли сессия с ZK. -В основном, то же самое, что и is_readonly. - -future_parts: количество кусков с данными, которые появятся в результате INSERT-ов или слияний, которых ещё предстоит сделать - -parts_to_check: количество кусков с данными в очереди на проверку -Кусок помещается в очередь на проверку, если есть подозрение, что он может быть битым. - -zookeeper_path: путь к данным таблицы в ZK -replica_name: имя реплики в ZK; разные реплики одной таблицы имеют разное имя -replica_path: путь к данным реплики в ZK. То же самое, что конкатенация zookeeper_path/replicas/replica_path. - -columns_version: номер версии структуры таблицы -Обозначает, сколько раз был сделан ALTER. Если на репликах разные версии, значит некоторые реплики сделали ещё не все ALTER-ы. - -queue_size: размер очереди действий, которых предстоит сделать -К действиям относятся вставки блоков данных, слияния, и некоторые другие действия. -Как правило, совпадает с future_parts. - -inserts_in_queue: количество вставок блоков данных, которых предстоит сделать -Обычно вставки должны быстро реплицироваться. Если величина большая - значит что-то не так. - -merges_in_queue: количество слияний, которых предстоит сделать -Бывают длинные слияния - то есть, это значение может быть больше нуля продолжительное время. +- `is_session_expired` (`UInt8`) - истекла ли сессия с ZK. В основном, то же самое, что и `is_readonly`. +- `future_parts` (`UInt32`) - количество кусков с данными, которые появятся в результате INSERT-ов или слияний, которых ещё предстоит сделать +- `parts_to_check` (`UInt32`) - количество кусков с данными в очереди на проверку. Кусок помещается в очередь на проверку, если есть подозрение, что он может быть битым. +- `zookeeper_path` (`String`) - путь к данным таблицы в ZK. +- `replica_name` (`String`) - имя реплики в ZK; разные реплики одной таблицы имеют разное имя. +- `replica_path` (`String`) - путь к данным реплики в ZK. То же самое, что конкатенация zookeeper_path/replicas/replica_path. +- `columns_version` (`Int32`) - номер версии структуры таблицы. Обозначает, сколько раз был сделан ALTER. Если на репликах разные версии, значит некоторые реплики сделали ещё не все ALTER-ы. +- `queue_size` (`UInt32`) - размер очереди действий, которые предстоит сделать. К действиям относятся вставки блоков данных, слияния, и некоторые другие действия. Как правило, совпадает с future_parts. +- `inserts_in_queue` (`UInt32`) - количество вставок блоков данных, которые предстоит сделать. Обычно вставки должны быстро реплицироваться. Если величина большая - значит что-то не так. +- `merges_in_queue` (`UInt32`) - количество слияний, которые предстоит сделать. Бывают длинные слияния - то есть, это значение может быть больше нуля продолжительное время. +- `part_mutations_in_queue` (`UInt32`) - количество мутаций, которые предстоит сделать. +- `queue_oldest_time` (`DateTime`) - если `queue_size` больше 0, показывает, когда была добавлена в очередь самая старая операция. +- `inserts_oldest_time` (`DateTime`) - см. `queue_oldest_time`. +- `merges_oldest_time` (`DateTime`) - см. `queue_oldest_time`. +- `part_mutations_oldest_time` (`DateTime`) - см. `queue_oldest_time`. Следующие 4 столбца имеют ненулевое значение только если активна сессия с ZK. -log_max_index: максимальный номер записи в общем логе действий -log_pointer: максимальный номер записи из общего лога действий, которую реплика скопировала в свою очередь для выполнения, плюс единица -Если log_pointer сильно меньше log_max_index, значит что-то не так. 
- -total_replicas: общее число известных реплик этой таблицы -active_replicas: число реплик этой таблицы, имеющих сессию в ZK; то есть, число работающих реплик -``` +- `log_max_index` (`UInt64`) - максимальный номер записи в общем логе действий. +- `log_pointer` (`UInt64`) - максимальный номер записи из общего лога действий, которую реплика скопировала в свою очередь для выполнения, плюс единица. Если log_pointer сильно меньше log_max_index, значит что-то не так. +- `last_queue_update` (`DateTime`) - When the queue was updated last time. +- `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has. +- `total_replicas` (`UInt8`) - общее число известных реплик этой таблицы. +- `active_replicas` (`UInt8`) - число реплик этой таблицы, имеющих сессию в ZK; то есть, число работающих реплик. Если запрашивать все столбцы, то таблица может работать слегка медленно, так как на каждую строчку делается несколько чтений из ZK. Если не запрашивать последние 4 столбца (log_max_index, log_pointer, total_replicas, active_replicas), то таблица работает быстро. From ee2cb00aa1bb254e343110af07d6846c5658136d Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 20 Feb 2020 15:46:27 +0300 Subject: [PATCH 34/40] Use getAllPhysical() --- dbms/src/Storages/StorageLog.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index 0c44081d68f..e9c539ea9a5 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -541,15 +541,8 @@ void StorageLog::truncate(const ASTPtr &, const Context &, TableStructureWriteLo const StorageLog::Marks & StorageLog::getMarksWithRealRowCount() const { /// There should be at least one physical column - auto begin = getColumns().begin(); - while (begin != getColumns().end() && begin->default_desc == ColumnDefaultKind::Alias) - ++begin; - - if (begin == getColumns().end()) - throw Exception("No physical columns found!", ErrorCodes::LOGICAL_ERROR); - - const String & column_name = begin->name; - const IDataType & column_type = *begin->type; + const String & column_name = getColumns().getAllPhysical().begin()->name; + const IDataType & column_type = *getColumns().getAllPhysical().begin()->type; String filename; /** We take marks from first column. 
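Note on the lookup in the hunk above: `getColumns().getAllPhysical()` returns its list by value, so binding plain references (`column_name`, `column_type`) to members of an element of that temporary leaves them dangling once the statement finishes; PATCH 39/40 below ("Don't use refs on vanishing objects") switches to copies for exactly this reason. A minimal standalone sketch of the pitfall, using hypothetical stand-in types rather than the real ClickHouse ones:

```cpp
#include <list>
#include <string>

// Hypothetical stand-ins, not the ClickHouse types: the important property is
// that getAllPhysical() returns the column list by value, i.e. as a temporary.
struct Column { std::string name; };
std::list<Column> getAllPhysical() { return {{"id"}, {"value"}}; }

int main()
{
    // Dangling: the temporary list is destroyed at the end of this statement,
    // so `bad` would refer to freed memory (lifetime extension does not apply
    // to a reference bound to a member of an element of the temporary).
    // const std::string & bad = getAllPhysical().begin()->name;

    // Safe: copy the value out before the temporary disappears.
    const std::string good = getAllPhysical().begin()->name;
    return good == "id" ? 0 : 1;
}
```

Copying the values out of the temporary, as the later patch does, removes the lifetime hazard at negligible cost.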
From 87b32a283178c3b9b07c7e4b8b4ce1074b1bc8cc Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 20 Feb 2020 16:32:48 +0300 Subject: [PATCH 35/40] Create events.js --- website/workers/events.js | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 website/workers/events.js diff --git a/website/workers/events.js b/website/workers/events.js new file mode 100644 index 00000000000..653139af9f9 --- /dev/null +++ b/website/workers/events.js @@ -0,0 +1,34 @@ +addEventListener('fetch', event => { + event.respondWith(handleRequest(event.request)) +}) + +async function handleRequest(request) { + let raw = await fetch('https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/README.md'); + let text = await raw.text(); + let lines = text.split('\n'); + let skip = true; + let events = []; + for (let idx in lines) { + let line = lines[idx]; + if (skip) { + if (line.includes('Upcoming Events')) { + skip = false; + } + } else { + if (!line) { continue; }; + line = line.split(']('); + var tail = line[1].split(') '); + events.push({ + 'signup_link': tail[0], + 'event_name': line[0].replace('* [', ''), + 'event_date': tail[1].slice(0, -1).replace('on ', '') + }); + } + } + + let response = new Response(JSON.stringify({ + 'events': events + })); + response.headers.set('Content-Type', 'application/json'); + return response; +} From 9748f8dcf65eff3a35baf4e06e1671bb2f6e74dd Mon Sep 17 00:00:00 2001 From: chertus Date: Thu, 20 Feb 2020 16:33:14 +0300 Subject: [PATCH 36/40] fix bug with comma join and in --- .../Interpreters/CrossToInnerJoinVisitor.cpp | 23 ++++--------------- .../0_stateless/00863_comma_join_in.sql | 2 ++ ...01083_cross_to_inner_with_in_bug.reference | 1 + .../01083_cross_to_inner_with_in_bug.sql | 13 +++++++++++ 4 files changed, 21 insertions(+), 18 deletions(-) create mode 100644 dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.reference create mode 100644 dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.sql diff --git a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp index 54d5205c4c2..740fe35e936 100644 --- a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -124,15 +124,12 @@ public: { /// leave other comparisons as is } - else if (functionIsLikeOperator(node.name)) /// LIKE, NOT LIKE + else if (functionIsLikeOperator(node.name) || /// LIKE, NOT LIKE + functionIsInOperator(node.name)) /// IN, NOT IN { - /// leave as is - } - else if (functionIsInOperator(node.name)) /// IN, NOT IN - { - if (auto ident = node.arguments->children.at(0)->as()) - if (size_t min_table = checkIdentifier(*ident)) - asts_to_join_on[min_table].push_back(ast); + /// leave as is. It's not possible to make push down here cause of unknown aliases and not implemented JOIN predicates. 
+ /// select a as b form t1, t2 where t1.x = t2.x and b in(42) + /// select a as b form t1 inner join t2 on t1.x = t2.x and b in(42) } else { @@ -202,16 +199,6 @@ private: } return 0; } - - size_t checkIdentifier(const ASTIdentifier & identifier) - { - size_t best_table_pos = 0; - bool match = IdentifierSemantic::chooseTable(identifier, tables, best_table_pos); - - if (match && joined_tables[best_table_pos].canAttachOnExpression()) - return best_table_pos; - return 0; - } }; using CheckExpressionMatcher = ConstOneTypeMatcher; diff --git a/dbms/tests/queries/0_stateless/00863_comma_join_in.sql b/dbms/tests/queries/0_stateless/00863_comma_join_in.sql index 8cfc00627dc..ebccd351c8a 100644 --- a/dbms/tests/queries/0_stateless/00863_comma_join_in.sql +++ b/dbms/tests/queries/0_stateless/00863_comma_join_in.sql @@ -10,6 +10,8 @@ insert into test1_00863 (id, code) select number, toString(number) FROM numbers( insert into test3_00863 (id, code) select number, toString(number) FROM numbers(100000); insert into test2_00863 (id, code, test1_id, test3_id) select number, toString(number), number, number FROM numbers(100000); +SET max_memory_usage = 50000000; + select test2_00863.id from test1_00863, test2_00863, test3_00863 where test1_00863.code in ('1', '2', '3') diff --git a/dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.reference b/dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.reference new file mode 100644 index 00000000000..699fa0cd95c --- /dev/null +++ b/dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.reference @@ -0,0 +1 @@ +2 2 1 diff --git a/dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.sql b/dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.sql new file mode 100644 index 00000000000..f6d788512f8 --- /dev/null +++ b/dbms/tests/queries/0_stateless/01083_cross_to_inner_with_in_bug.sql @@ -0,0 +1,13 @@ +drop table if exists ax; +drop table if exists bx; + +create table ax (A Int64, B Int64) Engine = Memory; +create table bx (A Int64) Engine = Memory; + +insert into ax values (1, 1), (2, 1); +insert into bx values (2), (4); + +select * from bx, ax where ax.A = bx.A and ax.B in (1,2); + +drop table ax; +drop table bx; From 0c686baf4f810f9aba2d3da9dd0c1e068ec4a93f Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Thu, 20 Feb 2020 16:33:19 +0300 Subject: [PATCH 37/40] Create repo.js --- website/workers/repo.js | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 website/workers/repo.js diff --git a/website/workers/repo.js b/website/workers/repo.js new file mode 100644 index 00000000000..470391cf225 --- /dev/null +++ b/website/workers/repo.js @@ -0,0 +1,10 @@ +addEventListener('fetch', event => { + event.respondWith(handleRequest(event.request)) +}) + +async function handleRequest(request) { + let url = new URL(request.url); + url.hostname = 'repo.yandex.ru'; + url.pathname = '/clickhouse' + url.pathname; + return fetch(url) +} From 9191e5f17facdcdaefb6757575d48d8588457552 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 20 Feb 2020 18:26:20 +0300 Subject: [PATCH 38/40] Fix NDEBUG in PipelineExecutor. 
--- .../Processors/Executors/PipelineExecutor.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dbms/src/Processors/Executors/PipelineExecutor.cpp b/dbms/src/Processors/Executors/PipelineExecutor.cpp index 728554c80cf..f4be5a84518 100644 --- a/dbms/src/Processors/Executors/PipelineExecutor.cpp +++ b/dbms/src/Processors/Executors/PipelineExecutor.cpp @@ -263,7 +263,7 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue std::vector updated_direct_edges; { -#ifndef N_DEBUG +#ifndef NDEBUG Stopwatch watch; #endif @@ -279,7 +279,7 @@ bool PipelineExecutor::prepareProcessor(UInt64 pid, size_t thread_number, Queue return false; } -#ifndef N_DEBUG +#ifndef NDEBUG node.execution_state->preparation_time_ns += watch.elapsed(); #endif @@ -468,7 +468,7 @@ void PipelineExecutor::execute(size_t num_threads) } catch (...) { -#ifndef N_DEBUG +#ifndef NDEBUG LOG_TRACE(log, "Exception while executing query. Current state:\n" << dumpPipeline()); #endif throw; @@ -491,7 +491,7 @@ void PipelineExecutor::execute(size_t num_threads) void PipelineExecutor::executeSingleThread(size_t thread_num, size_t num_threads) { -#ifndef N_DEBUG +#ifndef NDEBUG UInt64 total_time_ns = 0; UInt64 execution_time_ns = 0; UInt64 processing_time_ns = 0; @@ -577,13 +577,13 @@ void PipelineExecutor::executeSingleThread(size_t thread_num, size_t num_threads addJob(state); { -#ifndef N_DEBUG +#ifndef NDEBUG Stopwatch execution_time_watch; #endif state->job(); -#ifndef N_DEBUG +#ifndef NDEBUG execution_time_ns += execution_time_watch.elapsed(); #endif } @@ -594,7 +594,7 @@ void PipelineExecutor::executeSingleThread(size_t thread_num, size_t num_threads if (finished) break; -#ifndef N_DEBUG +#ifndef NDEBUG Stopwatch processing_time_watch; #endif @@ -648,13 +648,13 @@ void PipelineExecutor::executeSingleThread(size_t thread_num, size_t num_threads doExpandPipeline(task, false); } -#ifndef N_DEBUG +#ifndef NDEBUG processing_time_ns += processing_time_watch.elapsed(); #endif } } -#ifndef N_DEBUG +#ifndef NDEBUG total_time_ns = total_time_watch.elapsed(); wait_time_ns = total_time_ns - execution_time_ns - processing_time_ns; @@ -769,7 +769,7 @@ String PipelineExecutor::dumpPipeline() const WriteBufferFromOwnString buffer; buffer << "(" << node.execution_state->num_executed_jobs << " jobs"; -#ifndef N_DEBUG +#ifndef NDEBUG buffer << ", execution time: " << node.execution_state->execution_time_ns / 1e9 << " sec."; buffer << ", preparation time: " << node.execution_state->preparation_time_ns / 1e9 << " sec."; #endif From d211b1a4826155d7fcaea23c963d2040e701e067 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 20 Feb 2020 18:40:23 +0300 Subject: [PATCH 39/40] Don't use refs on vanishing objects --- dbms/src/Storages/StorageLog.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbms/src/Storages/StorageLog.cpp b/dbms/src/Storages/StorageLog.cpp index e9c539ea9a5..b079d5d7a8a 100644 --- a/dbms/src/Storages/StorageLog.cpp +++ b/dbms/src/Storages/StorageLog.cpp @@ -541,8 +541,8 @@ void StorageLog::truncate(const ASTPtr &, const Context &, TableStructureWriteLo const StorageLog::Marks & StorageLog::getMarksWithRealRowCount() const { /// There should be at least one physical column - const String & column_name = getColumns().getAllPhysical().begin()->name; - const IDataType & column_type = *getColumns().getAllPhysical().begin()->type; + const String column_name = getColumns().getAllPhysical().begin()->name; + const auto column_type = 
getColumns().getAllPhysical().begin()->type; String filename; /** We take marks from first column. @@ -550,7 +550,7 @@ const StorageLog::Marks & StorageLog::getMarksWithRealRowCount() const * (Example: for Array data type, first stream is array sizes; and number of array sizes is the number of arrays). */ IDataType::SubstreamPath substream_root_path; - column_type.enumerateStreams([&](const IDataType::SubstreamPath & substream_path) + column_type->enumerateStreams([&](const IDataType::SubstreamPath & substream_path) { if (filename.empty()) filename = IDataType::getFileNameForStream(column_name, substream_path); From 3ae2282209ce504a6314d24204128aaaf73e7d40 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 20 Feb 2020 19:28:21 +0300 Subject: [PATCH 40/40] performance comparison --- docker/test/performance-comparison/compare.sh | 85 ++++++++++++++----- .../test/performance-comparison/entrypoint.sh | 2 +- docker/test/performance-comparison/perf.py | 2 +- 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index eba6aba0637..c782a52592b 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -27,13 +27,16 @@ function download wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O- | tar -C left --strip-components=1 -zxv & wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$right_pr/$right_sha/performance/performance.tgz" -O- | tar -C right --strip-components=1 -zxv & else - wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O- | tar -C left --strip-components=1 -zxv && cp -al left right + wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O- | tar -C left --strip-components=1 -zxv && cp -a left right & fi cd db0 && wget -nv -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" -O- | tar -xv & cd db0 && wget -nv -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" -O- | tar -xv & cd db0 && wget -nv -nd -c "https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" -O- | tar -xv & cd db0 && wget -nv -nd -c "https://clickhouse-datasets.s3.yandex.net/values_with_expressions/partitions/test_values.tar" -O- | tar -xv & + + mkdir ~/fg ; cd ~/fg && wget -nv -nd -c "https://raw.githubusercontent.com/brendangregg/FlameGraph/master/flamegraph.pl" && chmod +x ~/fg/flamegraph.pl & + wait } @@ -223,13 +226,13 @@ function get_profiles function report { -for x in *.tsv +for x in {right,left}-{addresses,{query,trace}-log}.tsv do # FIXME This loop builds column definitons from TSVWithNamesAndTypes in an # absolutely atrocious way. This should be done by the file() function itself. 
paste -d' ' \ - <(sed -n '1s/\t/\n/gp' "$x" | sed 's/\(^.*$\)/"\1"/') \ - <(sed -n '2s/\t/\n/gp' "$x" ) \ + <(sed -n '1{s/\t/\n/g;p;q}' "$x" | sed 's/\(^.*$\)/"\1"/') \ + <(sed -n '2{s/\t/\n/g;p;q}' "$x" ) \ | tr '\n' ', ' | sed 's/,$//' > "$x.columns" done @@ -301,43 +304,85 @@ create view right_query_log as select * create view right_trace_log as select * from file('right-trace-log.tsv', TSVWithNamesAndTypes, '$(cat right-trace-log.tsv.columns)'); -create view right_addresses as select * +create view right_addresses_src as select * from file('right-addresses.tsv', TSVWithNamesAndTypes, '$(cat right-addresses.tsv.columns)'); -create table unstable_query_ids engine File(TSVWithNamesAndTypes, 'unstable-query-ids.rep') as - select query_id from right_query_log +create table right_addresses_join engine Join(any, left, address) as + select addr address, name from right_addresses_src; + +create table unstable_query_runs engine File(TSVWithNamesAndTypes, 'unstable-query-runs.rep') as + select query_id, query from right_query_log join unstable_queries_tsv using query + where query_id not like 'prewarm %' ; -create table unstable_query_metrics engine File(TSVWithNamesAndTypes, 'unstable-query-metrics.rep') as +create table unstable_query_log engine File(Vertical, 'unstable-query-log.rep') as + select * from right_query_log + where query_id in (select query_id from unstable_query_runs); + +create table unstable_run_metrics engine File(TSVWithNamesAndTypes, 'unstable-run-metrics.rep') as select ProfileEvents.Values value, ProfileEvents.Names metric, query_id, query from right_query_log array join ProfileEvents - where query_id in (unstable_query_ids) + where query_id in (select query_id from unstable_query_runs) ; -create table unstable_query_traces engine File(TSVWithNamesAndTypes, 'unstable-query-traces.rep') as - select count() value, right_addresses.name metric, - unstable_query_ids.query_id, any(right_query_log.query) query - from unstable_query_ids - join right_query_log on right_query_log.query_id = unstable_query_ids.query_id - join right_trace_log on right_trace_log.query_id = unstable_query_ids.query_id - join right_addresses on addr = arrayJoin(trace) - group by unstable_query_ids.query_id, metric +create table unstable_run_metrics_2 engine File(TSVWithNamesAndTypes, 'unstable-run-metrics-2.rep') as + select v, n, query_id, query + from + (select + ['memory_usage', 'read_bytes', 'written_bytes'] n, + [memory_usage, read_bytes, written_bytes] v, + query, + query_id + from right_query_log + where query_id in (select query_id from unstable_query_runs)) + array join n, v; + +create table unstable_run_traces engine File(TSVWithNamesAndTypes, 'unstable-run-traces.rep') as + select count() value, joinGet(right_addresses_join, 'name', arrayJoin(trace)) metric, + unstable_query_runs.query_id, any(unstable_query_runs.query) query + from unstable_query_runs + join right_trace_log on right_trace_log.query_id = unstable_query_runs.query_id + group by unstable_query_runs.query_id, metric order by count() desc ; create table metric_devation engine File(TSVWithNamesAndTypes, 'metric-deviation.rep') as select floor((q[3] - q[1])/q[2], 3) d, - quantilesExact(0.05, 0.5, 0.95)(value) q, metric, query - from (select * from unstable_query_metrics - union all select * from unstable_query_traces) + quantilesExact(0, 0.5, 1)(value) q, metric, query + from (select * from unstable_run_metrics + union all select * from unstable_run_traces + union all select * from unstable_run_metrics_2) join queries using query 
group by query, metric having d > 0.5 order by any(rd[3]) desc, d desc ; + +create table stacks engine File(TSV, 'stacks.rep') as + select + query, + arrayStringConcat( + arrayMap(x -> joinGet(right_addresses_join, 'name', x), + arrayReverse(trace) + ), + ';' + ) readable_trace, + count() + from right_trace_log + join unstable_query_runs using query_id + group by query, trace + ; " +IFS=$'\n' +for q in $(cut -d' ' -f1 stacks.rep | sort | uniq) +do + grep -F "$q" stacks.rep | cut -d' ' -f 2- | tee "$q.stacks.rep" | ~/fg/flamegraph.pl > "$q.svg" & +done +wait +unset IFS + # Remember that grep sets error code when nothing is found, hence the bayan # operator grep Exception:[^:] *-err.log > run-errors.log ||: diff --git a/docker/test/performance-comparison/entrypoint.sh b/docker/test/performance-comparison/entrypoint.sh index bb5c2f6dbee..1248492914e 100755 --- a/docker/test/performance-comparison/entrypoint.sh +++ b/docker/test/performance-comparison/entrypoint.sh @@ -59,5 +59,5 @@ set +m dmesg > dmesg.log -7z a /output/output.7z *.log *.tsv *.html *.txt *.rep +7z a /output/output.7z *.log *.tsv *.html *.txt *.rep *.svg cp compare.log /output diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index 1205fc97ffd..13ba6301444 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -116,7 +116,7 @@ for q in test_queries: # Prewarm: run once on both servers. Helps to bring the data into memory, # precompile the queries, etc. for conn_index, c in enumerate(connections): - res = c.execute(q) + res = c.execute(q, query_id = 'prewarm {} {}'.format(0, q)) print('prewarm\t' + tsv_escape(q) + '\t' + str(conn_index) + '\t' + str(c.last_query.elapsed)) # Now, perform measured runs.