From 18fe1c796b6e2995d4de51e28f769bc0ae0ebf58 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 22 Dec 2020 21:47:47 +0300 Subject: [PATCH 001/122] Ability to backup-restore metadata files for DiskS3 (WIP) --- src/Disks/DiskCacheWrapper.cpp | 13 -- src/Disks/DiskCacheWrapper.h | 1 - src/Disks/DiskLocal.cpp | 5 - src/Disks/DiskLocal.h | 2 - src/Disks/DiskMemory.cpp | 5 - src/Disks/DiskMemory.h | 2 - src/Disks/IDisk.h | 3 - src/Disks/S3/DiskS3.cpp | 342 +++++++++++++++++++++++++++++---- src/Disks/S3/DiskS3.h | 31 ++- 9 files changed, 331 insertions(+), 73 deletions(-) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 7ce963380d4..89bab7cfa98 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -239,19 +239,6 @@ void DiskCacheWrapper::replaceFile(const String & from_path, const String & to_p DiskDecorator::replaceFile(from_path, to_path); } -void DiskCacheWrapper::copyFile(const String & from_path, const String & to_path) -{ - if (cache_disk->exists(from_path)) - { - auto dir_path = getDirectoryPath(to_path); - if (!cache_disk->exists(dir_path)) - cache_disk->createDirectories(dir_path); - - cache_disk->copyFile(from_path, to_path); - } - DiskDecorator::copyFile(from_path, to_path); -} - void DiskCacheWrapper::remove(const String & path) { if (cache_disk->exists(path)) diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index b0b373d900c..711ad5280ec 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -32,7 +32,6 @@ public: void moveDirectory(const String & from_path, const String & to_path) override; void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; - void copyFile(const String & from_path, const String & to_path) override; std::unique_ptr readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; std::unique_ptr diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index cde9b3c5a41..364b5bf4e2f 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -220,11 +220,6 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) from_file.renameTo(to_file.path()); } -void DiskLocal::copyFile(const String & from_path, const String & to_path) -{ - Poco::File(disk_path + from_path).copyTo(disk_path + to_path); -} - std::unique_ptr DiskLocal::readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const { diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 762a8502faa..eac95c543ef 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -67,8 +67,6 @@ public: void replaceFile(const String & from_path, const String & to_path) override; - void copyFile(const String & from_path, const String & to_path) override; - void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; void listFiles(const String & path, std::vector & file_names) override; diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index d185263d48c..ef68ad19191 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -314,11 +314,6 @@ void DiskMemory::replaceFileImpl(const String & from_path, const String & to_pat files.insert(std::move(node)); } -void DiskMemory::copyFile(const String & /*from_path*/, const String & /*to_path*/) -{ - throw Exception("Method 
copyFile is not implemented for memory disks", ErrorCodes::NOT_IMPLEMENTED); -} - std::unique_ptr DiskMemory::readFile(const String & path, size_t /*buf_size*/, size_t, size_t, size_t) const { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index 4d4b947098b..5c81051eaa4 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -60,8 +60,6 @@ public: void replaceFile(const String & from_path, const String & to_path) override; - void copyFile(const String & from_path, const String & to_path) override; - void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr readFile( diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index ac0f5a2ae8f..d20c1327509 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -127,9 +127,6 @@ public: /// If a file with `to_path` path already exists, it will be replaced. virtual void replaceFile(const String & from_path, const String & to_path) = 0; - /// Copy the file from `from_path` to `to_path`. - virtual void copyFile(const String & from_path, const String & to_path) = 0; - /// Recursively copy data containing at `from_path` to `to_path` located at `to_disk`. virtual void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path); diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 4786c05f8b0..d4b2f43b70a 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include @@ -32,6 +34,7 @@ namespace DB namespace ErrorCodes { + extern const int S3_ERROR; extern const int FILE_ALREADY_EXISTS; extern const int CANNOT_SEEK_THROUGH_FILE; extern const int UNKNOWN_FORMAT; @@ -76,12 +79,12 @@ String getRandomName() } template -void throwIfError(Aws::Utils::Outcome && response) +void throwIfError(Aws::Utils::Outcome & response) { if (!response.IsSuccess()) { const auto & err = response.GetError(); - throw Exception(err.GetMessage(), static_cast(err.GetErrorType())); + throw Exception(std::to_string(static_cast(err.GetErrorType())) + ": " + err.GetMessage(), ErrorCodes::S3_ERROR); } } @@ -613,45 +616,31 @@ void DiskS3::moveFile(const String & from_path, const String & to_path) { if (exists(to_path)) throw Exception("File already exists: " + to_path, ErrorCodes::FILE_ALREADY_EXISTS); + + if (send_metadata) + { + auto revision = ++revision_counter; + const DiskS3::ObjectMetadata object_metadata { + {"from_path", from_path}, + {"to_path", to_path} + }; + createFileOperationObject("rename", revision, object_metadata); + } + Poco::File(metadata_path + from_path).renameTo(metadata_path + to_path); } void DiskS3::replaceFile(const String & from_path, const String & to_path) { - Poco::File from_file(metadata_path + from_path); - Poco::File to_file(metadata_path + to_path); - if (to_file.exists()) + if (exists(to_path)) { - Poco::File tmp_file(metadata_path + to_path + ".old"); - to_file.renameTo(tmp_file.path()); - from_file.renameTo(metadata_path + to_path); - remove(to_path + ".old"); + const String tmp_path = to_path + ".old"; + moveFile(to_path, tmp_path); + moveFile(from_path, to_path); + remove(tmp_path); } else - from_file.renameTo(to_file.path()); -} - -void DiskS3::copyFile(const String & from_path, const String & to_path) -{ - if (exists(to_path)) - remove(to_path); - - auto from = readMeta(from_path); - auto to = createMeta(to_path); - - for (const auto & [path, size] : from.s3_objects) - { - auto new_path = getRandomName(); - Aws::S3::Model::CopyObjectRequest 
req; - req.SetCopySource(bucket + "/" + s3_root_path + path); - req.SetBucket(bucket); - req.SetKey(s3_root_path + new_path); - throwIfError(client->CopyObject(req)); - - to.addObject(new_path, size); - } - - to.save(); + moveFile(from_path, to_path); } std::unique_ptr DiskS3::readFile(const String & path, size_t buf_size, size_t, size_t, size_t) const @@ -673,7 +662,17 @@ std::unique_ptr DiskS3::writeFile(const String & path, /// Path to store new S3 object. auto s3_path = getRandomName(); - auto object_metadata = createObjectMetadata(path); + + std::optional object_metadata; + if (send_metadata) + { + auto revision = ++revision_counter; + object_metadata = { + {"path", path} + }; + s3_path = "r" + revisionToString(revision) + "-file-" + s3_path; + } + if (!exist || mode == WriteMode::Rewrite) { /// If metadata file exists - remove and create new. @@ -727,6 +726,15 @@ void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys) } else /// In other case decrement number of references, save metadata and delete file. { + if (send_metadata) + { + auto revision = ++revision_counter; + const ObjectMetadata object_metadata { + {"path", path} + }; + createFileOperationObject("remove", revision, object_metadata); + } + --metadata.ref_count; metadata.save(); file.remove(); @@ -780,7 +788,8 @@ void DiskS3::removeAws(const AwsS3KeyKeeper & keys) Aws::S3::Model::DeleteObjectsRequest request; request.SetBucket(bucket); request.SetDelete(delkeys); - throwIfError(client->DeleteObjects(request)); + auto outcome = client->DeleteObjects(request); + throwIfError(outcome); } } } @@ -840,6 +849,16 @@ Poco::Timestamp DiskS3::getLastModified(const String & path) void DiskS3::createHardLink(const String & src_path, const String & dst_path) { + if (send_metadata) + { + auto revision = ++revision_counter; + const ObjectMetadata object_metadata { + {"src_path", src_path}, + {"dst_path", dst_path} + }; + createFileOperationObject("hardlink", revision, object_metadata); + } + /// Increment number of references. auto src = readMeta(src_path); ++src.ref_count; @@ -889,12 +908,257 @@ void DiskS3::shutdown() client->DisableRequestProcessing(); } -std::optional DiskS3::createObjectMetadata(const String & path) const +void DiskS3::createFileOperationObject(const String & operation_name, UInt64 revision, const DiskS3::ObjectMetadata & metadata) { - if (send_metadata) - return (DiskS3::ObjectMetadata){{"path", path}}; + const String key = "meta/r" + revisionToString(revision) + "-" + operation_name; + WriteBufferFromS3 buffer(client, bucket, s3_root_path + key, min_upload_part_size, max_single_part_upload_size, metadata); + buffer.write('0'); + buffer.finalize(); +} - return {}; +void DiskS3::startup() +{ + if (!send_metadata) + return; + + LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting up disk {}", name); + + /// Find last revision. + UInt64 l = 0, r = (static_cast(1)) << 63; + while (r - l > 1) + { + auto revision = (r - l) >> 1; + auto revision_str = revisionToString(revision); + /// Check that object or metaobject with such revision exists. 
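            /// For reference: with metadata recording enabled, data objects are uploaded as
            /// "r{revision}-file-{random}" and file operations as "meta/r{revision}-{operation}",
            /// so probing both prefixes is enough to tell whether a candidate revision was ever used.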
+ if (checkObjectExists(s3_root_path + "r" + revision_str) + || checkObjectExists(s3_root_path + "meta/r" + revision_str)) + l = revision; + else + r = revision; + } + revision_counter = l; + LOG_INFO(&Poco::Logger::get("DiskS3"), "Found last revision number {}", revision_counter); +} + +bool DiskS3::checkObjectExists(const String & prefix) +{ + Aws::S3::Model::ListObjectsV2Request request; + request.SetBucket(bucket); + request.SetPrefix(prefix); + request.SetMaxKeys(1); + + auto outcome = client->ListObjectsV2(request); + throwIfError(outcome); + + return !outcome.GetResult().GetContents().empty(); +} + +struct DiskS3::RestoreInformation +{ + UInt64 revision = (static_cast(1)) << 63; + String bucket; + String path; +}; + +void DiskS3::restore() +{ + if (!exists(restore_file)) + return; + + RestoreInformation information; + ///TODO: read restore information from restore_file. + + restoreFiles(information.bucket, information.path, information.revision); + restoreFileOperations(information.bucket, information.path, information.revision); +} + +Aws::S3::Model::HeadObjectResult DiskS3::headObject(const String & source_bucket, const String & key) +{ + Aws::S3::Model::HeadObjectRequest request; + request.SetBucket(source_bucket); + request.SetKey(key); + + auto outcome = client->HeadObject(request); + throwIfError(outcome); + + return outcome.GetResultWithOwnership(); +} + +void DiskS3::listObjects(const String & source_bucket, const String & source_path, std::function callback) +{ + Aws::S3::Model::ListObjectsV2Request request; + request.SetBucket(source_bucket); + request.SetPrefix(source_path); + request.SetMaxKeys(1000); + + Aws::S3::Model::ListObjectsV2Outcome outcome; + do + { + outcome = client->ListObjectsV2(request); + throwIfError(outcome); + + bool should_continue = callback(outcome.GetResult()); + + if (!should_continue) + break; + + request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); + } while (outcome.GetResult().GetIsTruncated()); +} + +void DiskS3::restoreFiles(const String & source_bucket, const String & source_path, UInt64 revision) +{ + LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore files for disk {}", name); + + std::vector> results; + + listObjects(source_bucket, source_path, [this, &source_bucket, &revision, &results](auto list_result) { + std::vector keys; + for (const auto & row : list_result.GetContents()) + { + const String & key = row.GetKey(); + + /// Skip meta objects. They will be processed separately. + if (key.find("/meta/") != String::npos) + continue; + + /// Filter early if it's possible to get revision from key. 
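            /// Note: extractRevisionFromKey() is still a stub that returns 0 at this point,
            /// so this early filter only takes effect once key parsing is implemented.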
+ if (extractRevisionFromKey(key) > revision) + continue; + + keys.push_back(key); + } + + if (!keys.empty()) + { + auto result = getExecutor().execute([this, &source_bucket, keys]() { processRestoreFiles(source_bucket, keys); + }); + + results.push_back(std::move(result)); + } + + return true; + }); + + for (auto & result : results) + result.wait(); + for (auto & result : results) + result.get(); + + LOG_INFO(&Poco::Logger::get("DiskS3"), "Files are restored for disk {}", name); +} + +inline String getDirectoryPath(const String & path) +{ + return Poco::Path{path}.setFileName("").toString(); +} + +void DiskS3::processRestoreFiles(const String & source_bucket, Strings keys) +{ + for (const auto & key : keys) + { + Aws::S3::Model::HeadObjectRequest request; + request.SetBucket(source_bucket); + request.SetKey(key); + + auto outcome = client->HeadObject(request); + throwIfError(outcome); + + auto object_metadata = outcome.GetResult().GetMetadata(); + + /// If object has 'path' in metadata then restore it. + auto path = object_metadata.find("path"); + if (path == object_metadata.end()) + continue; + + createDirectories(getDirectoryPath(path->second)); + auto metadata = createMeta(path->second); + + /// TODO: shrink common prefix of s3_root_path and key. + auto relative_key = key; + metadata.addObject(relative_key, outcome.GetResult().GetContentLength()); + + /// TODO: Copy object to configured bucket if source_bucket is different. + + metadata.save(); + } +} + +void DiskS3::restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 revision) +{ + LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", name); + + /// Temporarily disable sending metadata. + send_metadata = false; + + listObjects(source_bucket, source_path + "meta/", [this, &source_bucket, &revision](auto list_result) { + const String rename = "rename"; + const String remove = "remove"; + const String hardlink = "hardlink"; + + for (const auto & row : list_result.GetContents()) + { + const String & key = row.GetKey(); + + /// Stop processing when get revision more than required. + /// S3 ensures that keys will be listed in ascending UTF-8 bytes order. + if (extractRevisionFromKey(key) > revision) + return false; + + auto operation = extractOperationFromKey(key); + auto object_metadata = headObject(source_bucket, key).GetMetadata(); + if (operation == rename) + { + auto from_path = object_metadata["from_path"]; + auto to_path = object_metadata["to_path"]; + if (exists(from_path)) + moveFile(from_path, to_path); + } + else if (operation == remove) + { + removeIfExists(object_metadata["path"]); + } + else if (operation == hardlink) + { + auto src_path = object_metadata["src_path"]; + auto dst_path = object_metadata["dst_path"]; + /// Skip hardlinks to shadow (backup) directory. + if (exists(src_path) && dst_path.find("/shadow/") != String::npos) + createHardLink(src_path, dst_path); + } + } + + return true; + }); + + send_metadata = true; + + LOG_INFO(&Poco::Logger::get("DiskS3"), "File operations restored for disk {}", name); +} + +UInt64 DiskS3::extractRevisionFromKey(const String & key) +{ + /// TODO: Implement. + return 0; +} + +String DiskS3::extractOperationFromKey(const String & key) +{ + /// TODO: Implement. + return ""; +} + +String DiskS3::revisionToString(UInt64 revision) +{ + static constexpr size_t max_digits = 19; + + /// Align revision number with leading zeroes to have strict lexicographical order of them. 
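    /// For example, revision 42 is rendered as 17 leading zeroes followed by "42"
    /// (19 characters total), so lexicographic order of keys matches numeric order of revisions.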
+ auto revision_str = std::to_string(revision); + auto digits_to_align = max_digits - revision_str.length(); + for (size_t i = 0; i < digits_to_align; ++i) + revision_str = "0" + revision_str; + + return revision_str; } } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index f62c603adda..dfaa3136642 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -1,10 +1,14 @@ #pragma once +#include #include "Disks/DiskFactory.h" #include "Disks/Executor.h" #include "ProxyConfiguration.h" #include +#include +#include + #include @@ -19,12 +23,16 @@ namespace DB class DiskS3 : public IDisk { public: + /// File contains restore information + const String restore_file = "restore"; + using ObjectMetadata = std::map; friend class DiskS3Reservation; class AwsS3KeyKeeper; struct Metadata; + struct RestoreInformation; DiskS3( String name_, @@ -74,8 +82,6 @@ public: void replaceFile(const String & from_path, const String & to_path) override; - void copyFile(const String & from_path, const String & to_path) override; - void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr readFile( @@ -114,17 +120,34 @@ public: void shutdown() override; + /// Actions performed after disk creation. + void startup(); + + /// Restore S3 metadata files on file system. + void restore(); + private: bool tryReserve(UInt64 bytes); void removeMeta(const String & path, AwsS3KeyKeeper & keys); void removeMetaRecursive(const String & path, AwsS3KeyKeeper & keys); void removeAws(const AwsS3KeyKeeper & keys); - std::optional createObjectMetadata(const String & path) const; Metadata readMeta(const String & path) const; Metadata createMeta(const String & path) const; + void createFileOperationObject(const String & operation_name, UInt64 revision, const ObjectMetadata & metadata); + String revisionToString(UInt64 revision); + bool checkObjectExists(const String & prefix); + + Aws::S3::Model::HeadObjectResult headObject(const String & source_bucket, const String & key); + void listObjects(const String & source_bucket, const String & source_path, std::function callback); + void restoreFiles(const String & source_bucket, const String & source_path, UInt64 revision); + void processRestoreFiles(const String & source_bucket, std::vector keys); + void restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 revision); + UInt64 extractRevisionFromKey(const String & key); + String extractOperationFromKey(const String & key); + private: const String name; std::shared_ptr client; @@ -140,6 +163,8 @@ private: UInt64 reserved_bytes = 0; UInt64 reservation_count = 0; std::mutex reservation_mutex; + + std::atomic revision_counter; }; } From cc3b5958b047fc7c7f41557a9148deb63330e38f Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 23 Dec 2020 15:35:52 +0300 Subject: [PATCH 002/122] Ability to backup-restore metadata files for DiskS3 (WIP) --- src/Disks/DiskCacheWrapper.cpp | 15 +-- src/Disks/DiskCacheWrapper.h | 1 - src/Disks/DiskDecorator.cpp | 5 - src/Disks/DiskDecorator.h | 1 - src/Disks/IDisk.h | 7 ++ src/Disks/S3/DiskS3.cpp | 207 +++++++++++++++++++++----------- src/Disks/S3/DiskS3.h | 23 ++-- src/Disks/S3/registerDiskS3.cpp | 3 + 8 files changed, 171 insertions(+), 91 deletions(-) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 89bab7cfa98..d44f5a8e0d4 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -139,7 +139,7 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t 
estimate { try { - auto dir_path = getDirectoryPath(path); + auto dir_path = directoryPath(path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); @@ -182,7 +182,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Write file {} to cache", backQuote(path)); - auto dir_path = getDirectoryPath(path); + auto dir_path = directoryPath(path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); @@ -217,7 +217,7 @@ void DiskCacheWrapper::moveFile(const String & from_path, const String & to_path { if (cache_disk->exists(from_path)) { - auto dir_path = getDirectoryPath(to_path); + auto dir_path = directoryPath(to_path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); @@ -230,7 +230,7 @@ void DiskCacheWrapper::replaceFile(const String & from_path, const String & to_p { if (cache_disk->exists(from_path)) { - auto dir_path = getDirectoryPath(to_path); + auto dir_path = directoryPath(to_path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); @@ -257,7 +257,7 @@ void DiskCacheWrapper::createHardLink(const String & src_path, const String & ds { if (cache_disk->exists(src_path)) { - auto dir_path = getDirectoryPath(dst_path); + auto dir_path = directoryPath(dst_path); if (!cache_disk->exists(dir_path)) cache_disk->createDirectories(dir_path); @@ -278,11 +278,6 @@ void DiskCacheWrapper::createDirectories(const String & path) DiskDecorator::createDirectories(path); } -inline String DiskCacheWrapper::getDirectoryPath(const String & path) -{ - return Poco::Path{path}.setFileName("").toString(); -} - /// TODO: Current reservation mechanism leaks IDisk abstraction details. /// This hack is needed to return proper disk pointer (wrapper instead of implementation) from reservation object. class ReservationDelegate : public IReservation diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index 711ad5280ec..0722c2dab84 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -43,7 +43,6 @@ public: private: std::shared_ptr acquireDownloadMetadata(const String & path) const; - static String getDirectoryPath(const String & path); /// Disk to cache files. 
std::shared_ptr cache_disk; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index aaa54005f6f..8441803a2af 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -103,11 +103,6 @@ void DiskDecorator::replaceFile(const String & from_path, const String & to_path delegate->replaceFile(from_path, to_path); } -void DiskDecorator::copyFile(const String & from_path, const String & to_path) -{ - delegate->copyFile(from_path, to_path); -} - void DiskDecorator::copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) { delegate->copy(from_path, to_disk, to_path); diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 1ce3c3ea773..eed3c77abf6 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -32,7 +32,6 @@ public: void createFile(const String & path) override; void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; - void copyFile(const String & from_path, const String & to_path) override; void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index d20c1327509..7d3e498a40b 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -262,4 +262,11 @@ inline String fileName(const String & path) { return Poco::Path(path).getFileName(); } + +/// Return directory path for the specified path. +inline String directoryPath(const String & path) +{ + return Poco::Path(path).setFileName("").toString(); +} + } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index d4b2f43b70a..318fda72368 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -41,6 +41,7 @@ namespace ErrorCodes extern const int INCORRECT_DISK_INDEX; extern const int NOT_IMPLEMENTED; extern const int PATH_ACCESS_DENIED; + extern const int LOGICAL_ERROR; } @@ -849,7 +850,8 @@ Poco::Timestamp DiskS3::getLastModified(const String & path) void DiskS3::createHardLink(const String & src_path, const String & dst_path) { - if (send_metadata) + /// We don't need to record hardlinks created to shadow folder. + if (send_metadata && dst_path.find("/shadow/") != String::npos) { auto revision = ++revision_counter; const ObjectMetadata object_metadata { @@ -910,7 +912,7 @@ void DiskS3::shutdown() void DiskS3::createFileOperationObject(const String & operation_name, UInt64 revision, const DiskS3::ObjectMetadata & metadata) { - const String key = "meta/r" + revisionToString(revision) + "-" + operation_name; + const String key = "operations/r" + revisionToString(revision) + "-" + operation_name; WriteBufferFromS3 buffer(client, bucket, s3_root_path + key, min_upload_part_size, max_single_part_upload_size, metadata); buffer.write('0'); buffer.finalize(); @@ -929,9 +931,9 @@ void DiskS3::startup() { auto revision = (r - l) >> 1; auto revision_str = revisionToString(revision); - /// Check that object or metaobject with such revision exists. + /// Check that file or operation with such revision exists. 
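        /// Each probe issues up to two ListObjectsV2 requests (MaxKeys = 1), one per prefix,
        /// so locating the last revision takes roughly 63 probes at startup.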
if (checkObjectExists(s3_root_path + "r" + revision_str) - || checkObjectExists(s3_root_path + "meta/r" + revision_str)) + || checkObjectExists(s3_root_path + "operations/r" + revision_str)) l = revision; else r = revision; @@ -953,25 +955,6 @@ bool DiskS3::checkObjectExists(const String & prefix) return !outcome.GetResult().GetContents().empty(); } -struct DiskS3::RestoreInformation -{ - UInt64 revision = (static_cast(1)) << 63; - String bucket; - String path; -}; - -void DiskS3::restore() -{ - if (!exists(restore_file)) - return; - - RestoreInformation information; - ///TODO: read restore information from restore_file. - - restoreFiles(information.bucket, information.path, information.revision); - restoreFileOperations(information.bucket, information.path, information.revision); -} - Aws::S3::Model::HeadObjectResult DiskS3::headObject(const String & source_bucket, const String & key) { Aws::S3::Model::HeadObjectRequest request; @@ -1006,24 +989,102 @@ void DiskS3::listObjects(const String & source_bucket, const String & source_pat } while (outcome.GetResult().GetIsTruncated()); } -void DiskS3::restoreFiles(const String & source_bucket, const String & source_path, UInt64 revision) +void DiskS3::copyObject(const String & src_bucket, const String & src_key, const String & dst_bucket, const String & dst_key) +{ + Aws::S3::Model::CopyObjectRequest request; + request.SetCopySource(src_bucket + "/" + src_key); + request.SetBucket(dst_bucket); + request.SetKey(dst_key); + + auto outcome = client->CopyObject(request); + throwIfError(outcome); +} + +struct DiskS3::RestoreInformation +{ + UInt64 revision = (static_cast(1)) << 63; + String bucket; + String path; +}; + +void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_information) +{ + ReadBufferFromFile buffer(metadata_path + restore_file, 512); + buffer.next(); + + /// Empty file - just restore all metadata. + if (!buffer.hasPendingData()) + return; + + try + { + readIntText(restore_information.revision, buffer); + assertChar('\n', buffer); + + if (!buffer.hasPendingData()) + return; + + readText(restore_information.bucket, buffer); + assertChar('\n', buffer); + + if (!buffer.hasPendingData()) + return; + + readText(restore_information.path, buffer); + assertChar('\n', buffer); + + if (buffer.hasPendingData()) + throw Exception("Extra information at the end of restore file", ErrorCodes::UNKNOWN_FORMAT); + } + catch (const Exception & e) + { + throw Exception("Failed to read restore information", e, ErrorCodes::UNKNOWN_FORMAT); + } +} + +void DiskS3::restore() +{ + if (!exists(restore_file)) + return; + + try + { + RestoreInformation information; + information.bucket = bucket; + information.path = s3_root_path; + + readRestoreInformation(information); + + ///TODO: Cleanup FS and bucket if previous restore was failed. 
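        /// The "restore" file read above lives in the disk metadata directory and is plain text:
        /// up to three newline-terminated fields in this order - revision, source bucket,
        /// source path - and parsing simply stops at the first missing field.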
+ + restoreFiles(information.bucket, information.path, information.revision); + restoreFileOperations(information.bucket, information.path, information.revision); + } + catch (const Exception & e) + { + throw Exception("Failed to restore disk: " + name, e, ErrorCodes::LOGICAL_ERROR); + } +} + +void DiskS3::restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision) { LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore files for disk {}", name); std::vector> results; - - listObjects(source_bucket, source_path, [this, &source_bucket, &revision, &results](auto list_result) { + listObjects(source_bucket, source_path, [this, &source_bucket, &source_path, &target_revision, &results](auto list_result) + { std::vector keys; for (const auto & row : list_result.GetContents()) { const String & key = row.GetKey(); - /// Skip meta objects. They will be processed separately. - if (key.find("/meta/") != String::npos) + /// Skip file operations objects. They will be processed separately. + if (key.find("/operations/") != String::npos) continue; + auto [revision, _] = extractRevisionAndOperationFromKey(key); /// Filter early if it's possible to get revision from key. - if (extractRevisionFromKey(key) > revision) + if (revision > target_revision) continue; keys.push_back(key); @@ -1031,7 +1092,9 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa if (!keys.empty()) { - auto result = getExecutor().execute([this, &source_bucket, keys]() { processRestoreFiles(source_bucket, keys); + auto result = getExecutor().execute([this, &source_bucket, &source_path, keys]() + { + processRestoreFiles(source_bucket, source_path, keys); }); results.push_back(std::move(result)); @@ -1048,50 +1111,45 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa LOG_INFO(&Poco::Logger::get("DiskS3"), "Files are restored for disk {}", name); } -inline String getDirectoryPath(const String & path) -{ - return Poco::Path{path}.setFileName("").toString(); -} - -void DiskS3::processRestoreFiles(const String & source_bucket, Strings keys) +void DiskS3::processRestoreFiles(const String & source_bucket, const String & source_path, Strings keys) { for (const auto & key : keys) { - Aws::S3::Model::HeadObjectRequest request; - request.SetBucket(source_bucket); - request.SetKey(key); - - auto outcome = client->HeadObject(request); - throwIfError(outcome); - - auto object_metadata = outcome.GetResult().GetMetadata(); + auto head_result = headObject(source_bucket, key); + auto object_metadata = head_result.GetMetadata(); /// If object has 'path' in metadata then restore it. - auto path = object_metadata.find("path"); - if (path == object_metadata.end()) + auto path_entry = object_metadata.find("path"); + if (path_entry == object_metadata.end()) + { + LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have path key in metadata", key); continue; + } - createDirectories(getDirectoryPath(path->second)); - auto metadata = createMeta(path->second); + const auto & path = path_entry->second; - /// TODO: shrink common prefix of s3_root_path and key. - auto relative_key = key; - metadata.addObject(relative_key, outcome.GetResult().GetContentLength()); + createDirectories(directoryPath(path)); + auto metadata = createMeta(path); - /// TODO: Copy object to configured bucket if source_bucket is different. 
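        /// Only the local metadata file (relative key plus object size) is recreated here;
        /// the S3 object itself is copied below only when the restore source is a different bucket.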
+ auto relative_key = shrinkKey(source_path, key); + metadata.addObject(relative_key, head_result.GetContentLength()); + + /// Copy object to bucket configured for current DiskS3 instance. + if (bucket != source_bucket) + copyObject(source_bucket, key, bucket, s3_root_path + relative_key); metadata.save(); } } -void DiskS3::restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 revision) +void DiskS3::restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision) { LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", name); - /// Temporarily disable sending metadata. - send_metadata = false; + /// Disable sending metadata if we restore metadata to the same bucket. + send_metadata = bucket != source_bucket; - listObjects(source_bucket, source_path + "meta/", [this, &source_bucket, &revision](auto list_result) { + listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result) { const String rename = "rename"; const String remove = "remove"; const String hardlink = "hardlink"; @@ -1100,12 +1158,22 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { const String & key = row.GetKey(); + auto [revision, operation] = extractRevisionAndOperationFromKey(key); + if (revision == 0) + { + LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} with unknown revision", revision); + continue; + } + /// Stop processing when get revision more than required. /// S3 ensures that keys will be listed in ascending UTF-8 bytes order. - if (extractRevisionFromKey(key) > revision) + if (revision > target_revision) return false; - auto operation = extractOperationFromKey(key); + /// Keep original revision if restore to different bucket. + if (send_metadata) + revision_counter = revision - 1; + auto object_metadata = headObject(source_bucket, key).GetMetadata(); if (operation == rename) { @@ -1122,8 +1190,7 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { auto src_path = object_metadata["src_path"]; auto dst_path = object_metadata["dst_path"]; - /// Skip hardlinks to shadow (backup) directory. - if (exists(src_path) && dst_path.find("/shadow/") != String::npos) + if (exists(src_path)) createHardLink(src_path, dst_path); } } @@ -1136,21 +1203,27 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & LOG_INFO(&Poco::Logger::get("DiskS3"), "File operations restored for disk {}", name); } -UInt64 DiskS3::extractRevisionFromKey(const String & key) +std::tuple DiskS3::extractRevisionAndOperationFromKey(const String & key) { - /// TODO: Implement. - return 0; + UInt64 revision = 0; + String operation; + + re2::RE2::FullMatch(key, key_regexp, &revision, &operation); + + return {revision, operation}; } -String DiskS3::extractOperationFromKey(const String & key) +String DiskS3::shrinkKey(const String & path, const String & key) { - /// TODO: Implement. - return ""; + if (!key.starts_with(path)) + throw Exception("The key " + key + " prefix mismatch with given " + path, ErrorCodes::LOGICAL_ERROR); + + return key.substr(path.length()); } String DiskS3::revisionToString(UInt64 revision) { - static constexpr size_t max_digits = 19; + static constexpr size_t max_digits = 19; /// UInt64 max digits in decimal representation. /// Align revision number with leading zeroes to have strict lexicographical order of them. 
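            /// send_metadata was re-enabled above for restores into a different bucket, so the
            /// replayed rename/hardlink below is logged again under its original revision
            /// (the ++revision_counter inside moveFile/createHardLink brings it back to `revision`).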
auto revision_str = std::to_string(revision); diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index dfaa3136642..532ddcbd858 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -10,6 +10,7 @@ #include #include +#include namespace DB @@ -137,16 +138,22 @@ private: Metadata createMeta(const String & path) const; void createFileOperationObject(const String & operation_name, UInt64 revision, const ObjectMetadata & metadata); - String revisionToString(UInt64 revision); - bool checkObjectExists(const String & prefix); + static String revisionToString(UInt64 revision); + bool checkObjectExists(const String & prefix); Aws::S3::Model::HeadObjectResult headObject(const String & source_bucket, const String & key); void listObjects(const String & source_bucket, const String & source_path, std::function callback); - void restoreFiles(const String & source_bucket, const String & source_path, UInt64 revision); - void processRestoreFiles(const String & source_bucket, std::vector keys); - void restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 revision); - UInt64 extractRevisionFromKey(const String & key); - String extractOperationFromKey(const String & key); + void copyObject(const String & src_bucket, const String & src_key, const String & dst_bucket, const String & dst_key); + + void readRestoreInformation(RestoreInformation & restore_information); + void restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision); + void processRestoreFiles(const String & source_bucket, const String & source_path, std::vector keys); + void restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision); + + /// Remove 'path' prefix from 'key' to get relative key. + /// It's needed to store keys to metadata files in RELATIVE_PATHS version. + static String shrinkKey(const String & path, const String & key); + std::tuple extractRevisionAndOperationFromKey(const String & key); private: const String name; @@ -165,6 +172,8 @@ private: std::mutex reservation_mutex; std::atomic revision_counter; + /// Key has format: ../../r{revision}-{operation} + const re2::RE2 key_regexp {".*/r(\\d+)-(\\w+).*"}; }; } diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index fd658d95327..14aecb89517 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -160,6 +160,9 @@ void registerDiskS3(DiskFactory & factory) checkRemoveAccess(*s3disk); } + s3disk->restore(); + s3disk->startup(); + bool cache_enabled = config.getBool(config_prefix + ".cache_enabled", true); if (cache_enabled) From 2848b32af1768ad0b681550a7b967c72d4e6a0fb Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 23 Dec 2020 18:11:37 +0300 Subject: [PATCH 003/122] Ability to backup-restore metadata files for DiskS3 (WIP) --- src/Disks/S3/DiskS3.cpp | 71 ++++++++++++++++++++------------- src/Disks/S3/DiskS3.h | 14 +++++-- src/Disks/S3/registerDiskS3.cpp | 4 +- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 318fda72368..97a7dc4939f 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -504,17 +504,17 @@ private: CurrentMetrics::Increment metric_increment; }; -/// Runs tasks asynchronously using global thread pool. +/// Runs tasks asynchronously using thread pool. 
class AsyncExecutor : public Executor { public: - explicit AsyncExecutor() = default; + explicit AsyncExecutor(int thread_pool_size) : pool(ThreadPool(thread_pool_size)) { } std::future execute(std::function task) override { auto promise = std::make_shared>(); - GlobalThreadPool::instance().scheduleOrThrowOnError( + pool.scheduleOrThrowOnError( [promise, task]() { try @@ -535,6 +535,9 @@ public: return promise->get_future(); } + +private: + ThreadPool pool; }; @@ -548,8 +551,10 @@ DiskS3::DiskS3( size_t min_upload_part_size_, size_t max_single_part_upload_size_, size_t min_bytes_for_seek_, - bool send_metadata_) - : IDisk(std::make_unique()) + bool send_metadata_, + int thread_pool_size_, + int list_object_keys_size_) + : IDisk(std::make_unique(thread_pool_size_)) , name(std::move(name_)) , client(std::move(client_)) , proxy_configuration(std::move(proxy_configuration_)) @@ -560,6 +565,7 @@ DiskS3::DiskS3( , max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , send_metadata(send_metadata_) + , list_object_keys_size(list_object_keys_size_) { } @@ -727,15 +733,6 @@ void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys) } else /// In other case decrement number of references, save metadata and delete file. { - if (send_metadata) - { - auto revision = ++revision_counter; - const ObjectMetadata object_metadata { - {"path", path} - }; - createFileOperationObject("remove", revision, object_metadata); - } - --metadata.ref_count; metadata.save(); file.remove(); @@ -926,7 +923,7 @@ void DiskS3::startup() LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting up disk {}", name); /// Find last revision. - UInt64 l = 0, r = (static_cast(1)) << 63; + UInt64 l = 0, r = LATEST_REVISION; while (r - l > 1) { auto revision = (r - l) >> 1; @@ -1002,7 +999,7 @@ void DiskS3::copyObject(const String & src_bucket, const String & src_key, const struct DiskS3::RestoreInformation { - UInt64 revision = (static_cast(1)) << 63; + UInt64 revision = LATEST_REVISION; String bucket; String path; }; @@ -1054,6 +1051,20 @@ void DiskS3::restore() information.path = s3_root_path; readRestoreInformation(information); + if (information.revision == 0) + information.revision = LATEST_REVISION; + + if (information.bucket == bucket) + { + /// In this case we need to additionally cleanup S3 from objects with later revision. + /// Will be simply just restore to different path. + if (information.path == s3_root_path && information.revision != LATEST_REVISION) + throw Exception("Restoring to the same bucket and path is allowed if revision is latest (0)", ErrorCodes::BAD_ARGUMENTS); + + /// This case complicates S3 cleanup in case of unsuccessful restore. + if (information.path != s3_root_path && (information.path.starts_with(s3_root_path) || s3_root_path.starts_with(information.path))) + throw Exception("Restoring to the same bucket is allowed only if restore paths are same or not prefixes of each other", ErrorCodes::BAD_ARGUMENTS); + } ///TODO: Cleanup FS and bucket if previous restore was failed. 
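        /// Illustrative contents of {metadata_path}/restore for pulling everything up to
        /// revision 4 from another bucket (bucket and path below are just placeholders):
        ///     4
        ///     some-source-bucket
        ///     data/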
@@ -1122,7 +1133,7 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so auto path_entry = object_metadata.find("path"); if (path_entry == object_metadata.end()) { - LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have path key in metadata", key); + LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have 'path' key in metadata", key); continue; } @@ -1134,11 +1145,13 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so auto relative_key = shrinkKey(source_path, key); metadata.addObject(relative_key, head_result.GetContentLength()); - /// Copy object to bucket configured for current DiskS3 instance. - if (bucket != source_bucket) + /// Copy object if we restore to different bucket / path. + if (bucket != source_bucket || s3_root_path != source_path) copyObject(source_bucket, key, bucket, s3_root_path + relative_key); metadata.save(); + + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored {} file", path); } } @@ -1146,12 +1159,12 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", name); - /// Disable sending metadata if we restore metadata to the same bucket. - send_metadata = bucket != source_bucket; + /// Enable record file operations if we restore to different bucket / path. + send_metadata = bucket != source_bucket || s3_root_path != source_path; - listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result) { + listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result) + { const String rename = "rename"; - const String remove = "remove"; const String hardlink = "hardlink"; for (const auto & row : list_result.GetContents()) @@ -1170,7 +1183,7 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & if (revision > target_revision) return false; - /// Keep original revision if restore to different bucket. + /// Keep original revision if restore to different bucket / path. 
if (send_metadata) revision_counter = revision - 1; @@ -1180,18 +1193,20 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & auto from_path = object_metadata["from_path"]; auto to_path = object_metadata["to_path"]; if (exists(from_path)) + { moveFile(from_path, to_path); - } - else if (operation == remove) - { - removeIfExists(object_metadata["path"]); + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored rename {} -> {}", from_path, to_path); + } } else if (operation == hardlink) { auto src_path = object_metadata["src_path"]; auto dst_path = object_metadata["dst_path"]; if (exists(src_path)) + { createHardLink(src_path, dst_path); + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored hardlink {} -> {}", src_path, dst_path); + } } } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 532ddcbd858..0140104c10f 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -24,9 +24,6 @@ namespace DB class DiskS3 : public IDisk { public: - /// File contains restore information - const String restore_file = "restore"; - using ObjectMetadata = std::map; friend class DiskS3Reservation; @@ -45,7 +42,9 @@ public: size_t min_upload_part_size_, size_t max_single_part_upload_size_, size_t min_bytes_for_seek_, - bool send_metadata_); + bool send_metadata_, + int thread_pool_size_, + int list_object_keys_size_); const String & getName() const override { return name; } @@ -172,6 +171,13 @@ private: std::mutex reservation_mutex; std::atomic revision_counter; + static constexpr UInt64 LATEST_REVISION = (static_cast(1)) << 63; + + /// File contains restore information + const String restore_file = "restore"; + /// The number of keys listed in one request (1000 is max value). + int list_object_keys_size; + /// Key has format: ../../r{revision}-{operation} const re2::RE2 key_regexp {".*/r(\\d+)-(\\w+).*"}; }; diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index 14aecb89517..88344b975bd 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -150,7 +150,9 @@ void registerDiskS3(DiskFactory & factory) context.getSettingsRef().s3_min_upload_part_size, context.getSettingsRef().s3_max_single_part_upload_size, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), - config.getBool(config_prefix + ".send_object_metadata", false)); + config.getBool(config_prefix + ".send_object_metadata", false), + config.getInt(config_prefix + ".thread_pool_size", 16), + config.getInt(config_prefix + ".list_object_keys_size", 1000)); /// This code is used only to check access to the corresponding disk. 
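    /// The new keys read above are set per disk in the storage configuration, for example
    /// (disk name is arbitrary; values shown are the defaults; endpoint, credentials and
    /// other settings omitted):
    ///
    ///     <disk_s3>
    ///         <type>s3</type>
    ///         <send_metadata>false</send_metadata>
    ///         <thread_pool_size>16</thread_pool_size>
    ///         <list_object_keys_size>1000</list_object_keys_size>
    ///     </disk_s3>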
if (!config.getBool(config_prefix + ".skip_access_check", false)) From 0856b2c5144171f73eb36afcec500a261ed34258 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 11 Jan 2021 20:37:08 +0300 Subject: [PATCH 004/122] Ability to backup-restore metadata files for DiskS3 (fixes and tests) --- src/Disks/DiskCacheWrapper.cpp | 12 +- src/Disks/IDisk.h | 3 + src/Disks/S3/DiskS3.cpp | 83 ++++++++------ src/Disks/S3/DiskS3.h | 7 +- src/Disks/S3/registerDiskS3.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 10 +- tests/integration/helpers/cluster.py | 41 +++---- .../config.d/bg_processing_pool_conf.xml | 5 + .../configs/config.d/log_conf.xml | 12 ++ .../configs/config.d/storage_conf.xml | 34 ++++++ .../configs/config.d/users.xml | 5 + .../configs/config.xml | 20 ++++ .../test_merge_tree_s3_restore/test.py | 106 ++++++++++++++++++ 13 files changed, 269 insertions(+), 71 deletions(-) create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.d/bg_processing_pool_conf.xml create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.d/log_conf.xml create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf.xml create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.d/users.xml create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.xml create mode 100644 tests/integration/test_merge_tree_s3_restore/test.py diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index d44f5a8e0d4..8dc8a005f57 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -108,7 +108,7 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate if (!cache_file_predicate(path)) return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Read file {} from cache", backQuote(path)); + LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold); @@ -122,11 +122,11 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate { /// This thread will responsible for file downloading to cache. metadata->status = DOWNLOADING; - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "File {} doesn't exist in cache. Will download it", backQuote(path)); + LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} doesn't exist in cache. Will download it", backQuote(path)); } else if (metadata->status == DOWNLOADING) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Waiting for file {} download to cache", backQuote(path)); + LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Waiting for file {} download to cache", backQuote(path)); metadata->condition.wait(lock, [metadata] { return metadata->status == DOWNLOADED || metadata->status == ERROR; }); } } @@ -151,11 +151,11 @@ DiskCacheWrapper::readFile(const String & path, size_t buf_size, size_t estimate } cache_disk->moveFile(tmp_path, path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "File {} downloaded to cache", backQuote(path)); + LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} downloaded to cache", backQuote(path)); } catch (...) 
{ - tryLogCurrentException("DiskS3", "Failed to download file + " + backQuote(path) + " to cache"); + tryLogCurrentException("DiskCache", "Failed to download file + " + backQuote(path) + " to cache"); result_status = ERROR; } } @@ -180,7 +180,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode if (!cache_file_predicate(path)) return DiskDecorator::writeFile(path, buf_size, mode, estimated_size, aio_threshold); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Write file {} to cache", backQuote(path)); + LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Write file {} to cache", backQuote(path)); auto dir_path = directoryPath(path); if (!cache_disk->exists(dir_path)) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 7d3e498a40b..a26d5015ba0 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -195,6 +195,9 @@ public: /// Returns executor to perform asynchronous operations. virtual Executor & getExecutor() { return *executor; } + /// Invoked when partitions freeze is invoked. + virtual void onFreeze(const String &) { } + private: std::unique_ptr executor; }; diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 97a7dc4939f..a13fa148413 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -924,19 +924,24 @@ void DiskS3::startup() /// Find last revision. UInt64 l = 0, r = LATEST_REVISION; - while (r - l > 1) + while (l < r) { - auto revision = (r - l) >> 1; + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check revision in bounds {}-{}", l, r); + + auto revision = l + (r - l + 1) / 2; auto revision_str = revisionToString(revision); - /// Check that file or operation with such revision exists. + + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check object with revision {}", revision); + + /// Check file or operation with such revision exists. if (checkObjectExists(s3_root_path + "r" + revision_str) || checkObjectExists(s3_root_path + "operations/r" + revision_str)) l = revision; else - r = revision; + r = revision - 1; } revision_counter = l; - LOG_INFO(&Poco::Logger::get("DiskS3"), "Found last revision number {}", revision_counter); + LOG_INFO(&Poco::Logger::get("DiskS3"), "Found last revision number {} for disk {}", revision_counter, name); } bool DiskS3::checkObjectExists(const String & prefix) @@ -969,7 +974,7 @@ void DiskS3::listObjects(const String & source_bucket, const String & source_pat Aws::S3::Model::ListObjectsV2Request request; request.SetBucket(source_bucket); request.SetPrefix(source_path); - request.SetMaxKeys(1000); + request.SetMaxKeys(list_object_keys_size); Aws::S3::Model::ListObjectsV2Outcome outcome; do @@ -1000,13 +1005,13 @@ void DiskS3::copyObject(const String & src_bucket, const String & src_key, const struct DiskS3::RestoreInformation { UInt64 revision = LATEST_REVISION; - String bucket; - String path; + String source_bucket; + String source_path; }; void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_information) { - ReadBufferFromFile buffer(metadata_path + restore_file, 512); + ReadBufferFromFile buffer(metadata_path + restore_file_name, 512); buffer.next(); /// Empty file - just restore all metadata. 
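    /// An empty file therefore means: restore everything from this disk's own bucket and
    /// root path at the latest available revision (the defaults filled in by restore()).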
@@ -1021,13 +1026,13 @@ void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_informa if (!buffer.hasPendingData()) return; - readText(restore_information.bucket, buffer); + readText(restore_information.source_bucket, buffer); assertChar('\n', buffer); if (!buffer.hasPendingData()) return; - readText(restore_information.path, buffer); + readText(restore_information.source_path, buffer); assertChar('\n', buffer); if (buffer.hasPendingData()) @@ -1041,35 +1046,42 @@ void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_informa void DiskS3::restore() { - if (!exists(restore_file)) + if (!exists(restore_file_name)) return; try { RestoreInformation information; - information.bucket = bucket; - information.path = s3_root_path; + information.source_bucket = bucket; + information.source_path = s3_root_path; readRestoreInformation(information); if (information.revision == 0) information.revision = LATEST_REVISION; + if (!information.source_path.ends_with('/')) + information.source_path += '/'; - if (information.bucket == bucket) + if (information.source_bucket == bucket) { /// In this case we need to additionally cleanup S3 from objects with later revision. /// Will be simply just restore to different path. - if (information.path == s3_root_path && information.revision != LATEST_REVISION) + if (information.source_path == s3_root_path && information.revision != LATEST_REVISION) throw Exception("Restoring to the same bucket and path is allowed if revision is latest (0)", ErrorCodes::BAD_ARGUMENTS); /// This case complicates S3 cleanup in case of unsuccessful restore. - if (information.path != s3_root_path && (information.path.starts_with(s3_root_path) || s3_root_path.starts_with(information.path))) - throw Exception("Restoring to the same bucket is allowed only if restore paths are same or not prefixes of each other", ErrorCodes::BAD_ARGUMENTS); + if (information.source_path != s3_root_path && s3_root_path.starts_with(information.source_path)) + throw Exception("Restoring to the same bucket is allowed only if source path is not a sub-path of configured path in S3 disk", ErrorCodes::BAD_ARGUMENTS); } ///TODO: Cleanup FS and bucket if previous restore was failed. - restoreFiles(information.bucket, information.path, information.revision); - restoreFileOperations(information.bucket, information.path, information.revision); + restoreFiles(information.source_bucket, information.source_path, information.revision); + restoreFileOperations(information.source_bucket, information.source_path, information.revision); + + Poco::File restore_file(metadata_path + restore_file_name); + restore_file.remove(); + + LOG_INFO(&Poco::Logger::get("DiskS3"), "Restore disk {} finished", name); } catch (const Exception & e) { @@ -1093,7 +1105,7 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa if (key.find("/operations/") != String::npos) continue; - auto [revision, _] = extractRevisionAndOperationFromKey(key); + const auto [revision, _] = extractRevisionAndOperationFromKey(key); /// Filter early if it's possible to get revision from key. if (revision > target_revision) continue; @@ -1129,11 +1141,11 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so auto head_result = headObject(source_bucket, key); auto object_metadata = head_result.GetMetadata(); - /// If object has 'path' in metadata then restore it. + /// Restore file if object has 'path' in metadata. 
auto path_entry = object_metadata.find("path"); if (path_entry == object_metadata.end()) { - LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have 'path' key in metadata", key); + LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have 'path' in metadata", key); continue; } @@ -1141,17 +1153,16 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so createDirectories(directoryPath(path)); auto metadata = createMeta(path); - auto relative_key = shrinkKey(source_path, key); - metadata.addObject(relative_key, head_result.GetContentLength()); /// Copy object if we restore to different bucket / path. if (bucket != source_bucket || s3_root_path != source_path) copyObject(source_bucket, key, bucket, s3_root_path + relative_key); + metadata.addObject(relative_key, head_result.GetContentLength()); metadata.save(); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored {} file", path); + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored file {}", path); } } @@ -1159,7 +1170,7 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", name); - /// Enable record file operations if we restore to different bucket / path. + /// Enable recording file operations if we restore to different bucket / path. send_metadata = bucket != source_bucket || s3_root_path != source_path; listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result) @@ -1171,15 +1182,15 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { const String & key = row.GetKey(); - auto [revision, operation] = extractRevisionAndOperationFromKey(key); - if (revision == 0) + const auto [revision, operation] = extractRevisionAndOperationFromKey(key); + if (revision == UNKNOWN_REVISION) { - LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} with unknown revision", revision); + LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} with unknown revision", key); continue; } - /// Stop processing when get revision more than required. - /// S3 ensures that keys will be listed in ascending UTF-8 bytes order. + /// S3 ensures that keys will be listed in ascending UTF-8 bytes order (revision order). + /// We can stop processing if revision of the object is already more than required. if (revision > target_revision) return false; @@ -1220,7 +1231,7 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & std::tuple DiskS3::extractRevisionAndOperationFromKey(const String & key) { - UInt64 revision = 0; + UInt64 revision = UNKNOWN_REVISION; String operation; re2::RE2::FullMatch(key, key_regexp, &revision, &operation); @@ -1249,4 +1260,10 @@ String DiskS3::revisionToString(UInt64 revision) return revision_str; } +void DiskS3::onFreeze(const String & path) +{ + WriteBufferFromFile revision_file_buf(metadata_path + path + "revision.txt", 32); + writeIntText(revision_counter.load(), revision_file_buf); +} + } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 0140104c10f..c330bf0c4e6 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -126,6 +126,8 @@ public: /// Restore S3 metadata files on file system. 
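    /// Reads the plain-text 'restore' file from the metadata path, re-creates local metadata
    /// (copying objects when the source bucket/path differs), replays logged renames and
    /// hardlinks, and removes the file on success.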
void restore(); + void onFreeze(const String & path) override; + private: bool tryReserve(UInt64 bytes); @@ -172,9 +174,10 @@ private: std::atomic revision_counter; static constexpr UInt64 LATEST_REVISION = (static_cast(1)) << 63; + static constexpr UInt64 UNKNOWN_REVISION = 0; - /// File contains restore information - const String restore_file = "restore"; + /// File at path {metadata_path}/restore indicates that metadata restore is needed and contains restore information + const String restore_file_name = "restore"; /// The number of keys listed in one request (1000 is max value). int list_object_keys_size; diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index 88344b975bd..ad5ab15e30e 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -150,7 +150,7 @@ void registerDiskS3(DiskFactory & factory) context.getSettingsRef().s3_min_upload_part_size, context.getSettingsRef().s3_max_single_part_upload_size, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), - config.getBool(config_prefix + ".send_object_metadata", false), + config.getBool(config_prefix + ".send_metadata", false), config.getInt(config_prefix + ".thread_pool_size", 16), config.getInt(config_prefix + ".list_object_keys_size", 1000)); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 240759b29c7..ddc0e7c7808 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3604,6 +3604,10 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher(MatcherFn m const auto data_parts = getDataParts(); String backup_name = (!with_name.empty() ? escapeForFileName(with_name) : toString(increment)); + String backup_path = shadow_path + backup_name + "/"; + + for (const auto & disk : getStoragePolicy()->getDisks()) + disk->onFreeze(backup_path); PartitionCommandsResultInfo result; @@ -3613,12 +3617,10 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher(MatcherFn m if (!matcher(part)) continue; - part->volume->getDisk()->createDirectories(shadow_path); - - String backup_path = shadow_path + backup_name + "/"; - LOG_DEBUG(log, "Freezing part {} snapshot will be placed at {}", part->name, backup_path); + part->volume->getDisk()->createDirectories(shadow_path); + String backup_part_path = backup_path + relative_data_path + part->relative_path; if (auto part_in_memory = asInMemoryPart(part)) part_in_memory->flushToDisk(backup_path + relative_data_path, part->relative_path, metadata_snapshot); diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index a65a420cd5b..3a2723d29e3 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -1040,32 +1040,25 @@ class ClickHouseInstance: return self.http_query(sql=sql, data=data, params=params, user=user, password=password, expect_fail_and_get_error=True) - def kill_clickhouse(self, stop_start_wait_sec=5): - pid = self.get_process_pid("clickhouse") - if not pid: - raise Exception("No clickhouse found") - self.exec_in_container(["bash", "-c", "kill -9 {}".format(pid)], user='root') - time.sleep(stop_start_wait_sec) - - def restore_clickhouse(self, retries=100): - pid = self.get_process_pid("clickhouse") - if pid: - raise Exception("ClickHouse has already started") - self.exec_in_container(["bash", "-c", "{} --daemon".format(CLICKHOUSE_START_COMMAND)], user=str(os.getuid())) - from helpers.test_tools import assert_eq_with_retry - 
# wait start - assert_eq_with_retry(self, "select 1", "1", retry_count=retries) - - def restart_clickhouse(self, stop_start_wait_sec=5, kill=False): + def stop_clickhouse(self, start_wait_sec=5, kill=False): if not self.stay_alive: - raise Exception("clickhouse can be restarted only with stay_alive=True instance") + raise Exception("clickhouse can be stopped only with stay_alive=True instance") self.exec_in_container(["bash", "-c", "pkill {} clickhouse".format("-9" if kill else "")], user='root') - time.sleep(stop_start_wait_sec) + time.sleep(start_wait_sec) + + def start_clickhouse(self, stop_wait_sec=5): + if not self.stay_alive: + raise Exception("clickhouse can be started again only with stay_alive=True instance") + self.exec_in_container(["bash", "-c", "{} --daemon".format(CLICKHOUSE_START_COMMAND)], user=str(os.getuid())) # wait start from helpers.test_tools import assert_eq_with_retry - assert_eq_with_retry(self, "select 1", "1", retry_count=int(stop_start_wait_sec / 0.5), sleep_time=0.5) + assert_eq_with_retry(self, "select 1", "1", retry_count=int(stop_wait_sec / 0.5), sleep_time=0.5) + + def restart_clickhouse(self, stop_start_wait_sec=5, kill=False): + self.stop_clickhouse(stop_start_wait_sec, kill) + self.start_clickhouse(stop_start_wait_sec) def exec_in_container(self, cmd, detach=False, nothrow=False, **kwargs): container_id = self.get_docker_handle().id @@ -1085,9 +1078,7 @@ class ClickHouseInstance: return self.cluster.copy_file_to_container(container_id, local_path, dest_path) def get_process_pid(self, process_name): - output = self.exec_in_container(["bash", "-c", - "ps ax | grep '{}' | grep -v 'grep' | grep -v 'bash -c' | awk '{{print $1}}'".format( - process_name)]) + output = self.exec_in_container(["pidof", "{}".format(process_name)]) if output: try: pid = int(output.split('\n')[0].strip()) @@ -1403,7 +1394,7 @@ class ClickHouseKiller(object): self.clickhouse_node = clickhouse_node def __enter__(self): - self.clickhouse_node.kill_clickhouse() + self.clickhouse_node.stop_clickhouse() def __exit__(self, exc_type, exc_val, exc_tb): - self.clickhouse_node.restore_clickhouse() + self.clickhouse_node.start_clickhouse() diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/bg_processing_pool_conf.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/bg_processing_pool_conf.xml new file mode 100644 index 00000000000..a756c4434ea --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/bg_processing_pool_conf.xml @@ -0,0 +1,5 @@ + + 0.5 + 0.5 + 0.5 + diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/log_conf.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/log_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/log_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf.xml new file mode 100644 index 00000000000..9361a21efca --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf.xml @@ -0,0 +1,34 @@ + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + true + 1 + + + local + / + + + + + +
+ s3 +
+ + hdd + +
+
+
+
+ + + 0 + +
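[Editorial note] The XML element tags of the configuration files added in this commit (bg_processing_pool_conf.xml, log_conf.xml, storage_conf.xml, users.xml, config.xml) appear to have been stripped in this rendering, leaving only the bare values. Based on the visible values (the MinIO endpoint, `minio`/`minio123`, `true`, `1`, `local`, `/`, the policy disks `s3`/`hdd`, the trailing `0`), the 34-line hunk header, and the `send_metadata` / `list_object_keys_size` options read in registerDiskS3.cpp in this same patch, configs/config.d/storage_conf.xml presumably looks roughly like the sketch below. The element names themselves are assumptions taken from the usual ClickHouse storage-configuration layout, not from this patch.

```xml
<!-- Hypothetical reconstruction of configs/config.d/storage_conf.xml.
     Only the literal values (endpoint, credentials, "true", "1", "local", "/", "0")
     are visible in the patch; the element names are assumed. -->
<yandex>
    <storage_configuration>
        <disks>
            <s3>
                <type>s3</type>
                <endpoint>http://minio1:9001/root/data/</endpoint>
                <access_key_id>minio</access_key_id>
                <secret_access_key>minio123</secret_access_key>
                <send_metadata>true</send_metadata>
                <list_object_keys_size>1</list_object_keys_size>
            </s3>
            <hdd>
                <type>local</type>
                <path>/</path>
            </hdd>
        </disks>
        <policies>
            <s3>
                <volumes>
                    <main>
                        <disk>s3</disk>
                    </main>
                    <external>
                        <disk>hdd</disk>
                    </external>
                </volumes>
            </s3>
        </policies>
    </storage_configuration>

    <merge_tree>
        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
    </merge_tree>
</yandex>
```

The policy name `s3` in this sketch matches the `storage_policy='s3'` setting used by the integration test below, which is why it is assumed here.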
diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/users.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/users.xml new file mode 100644 index 00000000000..797113053f4 --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/users.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.xml new file mode 100644 index 00000000000..24b7344df3a --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.xml @@ -0,0 +1,20 @@ + + + 9000 + 127.0.0.1 + + + + true + none + + AcceptCertificateHandler + + + + + 500 + 5368709120 + ./clickhouse/ + users.xml + diff --git a/tests/integration/test_merge_tree_s3_restore/test.py b/tests/integration/test_merge_tree_s3_restore/test.py new file mode 100644 index 00000000000..6cafc077e81 --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -0,0 +1,106 @@ +import logging +import random +import string +import time + +import pytest +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance("node", main_configs=["configs/config.d/storage_conf.xml", + "configs/config.d/bg_processing_pool_conf.xml", + "configs/config.d/log_conf.xml"], user_configs=[], with_minio=True, stay_alive=True) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def random_string(length): + letters = string.ascii_letters + return ''.join(random.choice(letters) for i in range(length)) + + +def generate_values(date_str, count, sign=1): + data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] + data.sort(key=lambda tup: tup[1]) + return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) + + +def create_table(cluster, table_name, additional_settings=None): + node = cluster.instances["node"] + + create_table_statement = """ + CREATE TABLE {} ( + dt Date, + id Int64, + data String, + INDEX min_max (id) TYPE minmax GRANULARITY 3 + ) ENGINE=MergeTree() + PARTITION BY dt + ORDER BY (dt, id) + SETTINGS + storage_policy='s3', + old_parts_lifetime=600, + index_granularity=512 + """.format(table_name) + + if additional_settings: + create_table_statement += "," + create_table_statement += additional_settings + + node.query(create_table_statement) + + +@pytest.fixture(autouse=True) +def drop_table(cluster): + yield + node = cluster.instances["node"] + minio = cluster.minio_client + + node.query("DROP TABLE IF EXISTS s3_test NO DELAY") + + for obj in list(minio.list_objects(cluster.minio_bucket, 'data/')): + minio.remove_object(cluster.minio_bucket, obj.object_name) + + +# Restore to the same bucket and path with latest revision. 
+def test_simple_full_restore(cluster): + create_table(cluster, "s3_test") + + node = cluster.instances["node"] + + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-05', 4096))) + node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-05', 4096, -1))) + + # To ensure parts have merged + node.query("OPTIMIZE TABLE s3_test") + + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "({})".format(4096 * 4) + assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "({})".format(0) + + node.stop_clickhouse() + node.exec_in_container(['bash', '-c', 'rm -r /var/lib/clickhouse/disks/s3/*'], user='root') + node.start_clickhouse() + + # All data is removed. + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "({})".format(0) + + node.stop_clickhouse() + node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/disks/s3/restore'], user='root') + node.start_clickhouse() + + assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "({})".format(4096 * 4) + assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "({})".format(0) From 0164965bc0cd7557871bf53c11eb11dd4b934bb4 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 11 Jan 2021 20:40:11 +0300 Subject: [PATCH 005/122] Fix get_process_pid --- tests/integration/helpers/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 3a2723d29e3..43c553f5318 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -1078,7 +1078,7 @@ class ClickHouseInstance: return self.cluster.copy_file_to_container(container_id, local_path, dest_path) def get_process_pid(self, process_name): - output = self.exec_in_container(["pidof", "{}".format(process_name)]) + output = self.exec_in_container(["bash", "-c", "pidof {}".format(process_name)]) if output: try: pid = int(output.split('\n')[0].strip()) From b09862b7b92d37238202871897b2897d15a86d72 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Tue, 12 Jan 2021 20:18:40 +0300 Subject: [PATCH 006/122] Ability to backup-restore metadata files for DiskS3 (fixes and tests) --- src/Disks/DiskCacheWrapper.cpp | 3 +- src/Disks/DiskDecorator.cpp | 5 + src/Disks/DiskDecorator.h | 1 + src/Disks/S3/DiskS3.cpp | 15 +- src/Disks/S3/DiskS3.h | 6 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- .../MergeTree/MergeTreeDataMergerMutator.cpp | 2 +- tests/integration/helpers/cluster.py | 18 +- .../config.d/storage_conf_another_bucket.xml | 34 +++ .../storage_conf_another_bucket_path.xml | 34 +++ .../test_merge_tree_s3_restore/test.py | 260 ++++++++++++++++-- 11 files changed, 336 insertions(+), 44 deletions(-) create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket.xml create mode 100644 tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket_path.xml diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 8dc8a005f57..eab3f1fddd7 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -255,7 +255,8 @@ void DiskCacheWrapper::removeRecursive(const String & path) void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path) { - if 
(cache_disk->exists(src_path)) + /// Don't create hardlinks for cache files to shadow directory as it just waste cache disk space. + if (cache_disk->exists(src_path) && !dst_path.starts_with("shadow/")) { auto dir_path = directoryPath(dst_path); if (!cache_disk->exists(dir_path)) diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 8441803a2af..a7154e12e8e 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -180,4 +180,9 @@ Executor & DiskDecorator::getExecutor() return delegate->getExecutor(); } +void DiskDecorator::onFreeze(const String & path) +{ + delegate->onFreeze(path); +} + } diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index eed3c77abf6..e3c036cf3e1 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -50,6 +50,7 @@ public: void sync(int fd) const override; const String getType() const override { return delegate->getType(); } Executor & getExecutor() override; + void onFreeze(const String & path) override; protected: DiskPtr delegate; diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index a13fa148413..5787457bf11 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -40,6 +40,7 @@ namespace ErrorCodes extern const int UNKNOWN_FORMAT; extern const int INCORRECT_DISK_INDEX; extern const int NOT_IMPLEMENTED; + extern const int BAD_ARGUMENTS; extern const int PATH_ACCESS_DENIED; extern const int LOGICAL_ERROR; } @@ -848,7 +849,7 @@ Poco::Timestamp DiskS3::getLastModified(const String & path) void DiskS3::createHardLink(const String & src_path, const String & dst_path) { /// We don't need to record hardlinks created to shadow folder. - if (send_metadata && dst_path.find("/shadow/") != String::npos) + if (send_metadata && !dst_path.starts_with("shadow/")) { auto revision = ++revision_counter; const ObjectMetadata object_metadata { @@ -1075,6 +1076,9 @@ void DiskS3::restore() ///TODO: Cleanup FS and bucket if previous restore was failed. + LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting to restore disk {}. Revision: {}, Source bucket: {}, Source path: {}", + name, information.revision, information.source_bucket, information.source_path); + restoreFiles(information.source_bucket, information.source_path, information.revision); restoreFileOperations(information.source_bucket, information.source_path, information.revision); @@ -1085,6 +1089,8 @@ void DiskS3::restore() } catch (const Exception & e) { + LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to restore disk. Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString()); + throw Exception("Failed to restore disk: " + name, e, ErrorCodes::LOGICAL_ERROR); } } @@ -1206,7 +1212,7 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & if (exists(from_path)) { moveFile(from_path, to_path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored rename {} -> {}", from_path, to_path); + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored rename {} -> {}", revision, from_path, to_path); } } else if (operation == hardlink) @@ -1215,8 +1221,9 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & auto dst_path = object_metadata["dst_path"]; if (exists(src_path)) { + createDirectories(directoryPath(dst_path)); createHardLink(src_path, dst_path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored hardlink {} -> {}", src_path, dst_path); + LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. 
Restored hardlink {} -> {}", revision, src_path, dst_path); } } } @@ -1262,8 +1269,10 @@ String DiskS3::revisionToString(UInt64 revision) void DiskS3::onFreeze(const String & path) { + createDirectories(path); WriteBufferFromFile revision_file_buf(metadata_path + path + "revision.txt", 32); writeIntText(revision_counter.load(), revision_file_buf); + revision_file_buf.finalize(); } } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index c330bf0c4e6..bc5055b942a 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -126,6 +126,7 @@ public: /// Restore S3 metadata files on file system. void restore(); + /// Dumps current revision counter into file 'revision.txt' at given path. void onFreeze(const String & path) override; private: @@ -156,7 +157,6 @@ private: static String shrinkKey(const String & path, const String & key); std::tuple extractRevisionAndOperationFromKey(const String & key); -private: const String name; std::shared_ptr client; std::shared_ptr proxy_configuration; @@ -176,9 +176,9 @@ private: static constexpr UInt64 LATEST_REVISION = (static_cast(1)) << 63; static constexpr UInt64 UNKNOWN_REVISION = 0; - /// File at path {metadata_path}/restore indicates that metadata restore is needed and contains restore information + /// File at path {metadata_path}/restore contains metadata restore information const String restore_file_name = "restore"; - /// The number of keys listed in one request (1000 is max value). + /// The number of keys listed in one request (1000 is max value) int list_object_keys_size; /// Key has format: ../../r{revision}-{operation} diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index ddc0e7c7808..6a64c69c987 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3619,7 +3619,7 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher(MatcherFn m LOG_DEBUG(log, "Freezing part {} snapshot will be placed at {}", part->name, backup_path); - part->volume->getDisk()->createDirectories(shadow_path); + part->volume->getDisk()->createDirectories(backup_path); String backup_part_path = backup_path + relative_data_path + part->relative_path; if (auto part_in_memory = asInMemoryPart(part)) diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 2365ef141b6..9b0daba0749 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -1234,7 +1234,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mutatePartToTempor if (files_to_skip.count(it->name())) continue; - String destination = new_part_tmp_path + "/"; + String destination = new_part_tmp_path; String file_name = it->name(); auto rename_it = std::find_if(files_to_rename.begin(), files_to_rename.end(), [&file_name](const auto & rename_pair) { return rename_pair.first == file_name; }); if (rename_it != files_to_rename.end()) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 43c553f5318..65f438b6575 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -147,6 +147,7 @@ class ClickHouseCluster: self.minio_certs_dir = None self.minio_host = "minio1" self.minio_bucket = "root" + self.minio_bucket_2 = "root2" self.minio_port = 9001 self.minio_client = None # type: Minio self.minio_redirect_host = "proxy1" @@ -549,17 +550,18 @@ class ClickHouseCluster: 
print("Connected to Minio.") - if minio_client.bucket_exists(self.minio_bucket): - minio_client.remove_bucket(self.minio_bucket) + buckets = [self.minio_bucket, self.minio_bucket_2] - minio_client.make_bucket(self.minio_bucket) - - print(("S3 bucket '%s' created", self.minio_bucket)) + for bucket in buckets: + if minio_client.bucket_exists(bucket): + minio_client.remove_bucket(bucket) + minio_client.make_bucket(bucket) + print("S3 bucket '%s' created", bucket) self.minio_client = minio_client return except Exception as ex: - print(("Can't connect to Minio: %s", str(ex))) + print("Can't connect to Minio: %s", str(ex)) time.sleep(1) raise Exception("Can't wait Minio to start") @@ -1078,7 +1080,9 @@ class ClickHouseInstance: return self.cluster.copy_file_to_container(container_id, local_path, dest_path) def get_process_pid(self, process_name): - output = self.exec_in_container(["bash", "-c", "pidof {}".format(process_name)]) + output = self.exec_in_container(["bash", "-c", + "ps ax | grep '{}' | grep -v 'grep' | grep -v 'bash -c' | awk '{{print $1}}'".format( + process_name)]) if output: try: pid = int(output.split('\n')[0].strip()) diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket.xml new file mode 100644 index 00000000000..645d1111ab8 --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket.xml @@ -0,0 +1,34 @@ + + + + + s3 + http://minio1:9001/root2/data/ + minio + minio123 + true + 1 + + + local + / + + + + + +
+ s3 +
+ + hdd + +
+
+
+
+ + + 0 + +
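[Editorial note] The two bucket-variant configurations added in this commit (storage_conf_another_bucket.xml above and storage_conf_another_bucket_path.xml below) are likewise rendered with their XML tags stripped. Judging from the visible values, they presumably mirror the storage_conf.xml sketch given earlier in this series and differ only in the S3 endpoint, roughly as follows (element name assumed):

```xml
<!-- Hypothetical endpoint lines only; the rest of each file presumably matches
     the storage_conf.xml sketch shown earlier. -->
<!-- storage_conf_another_bucket.xml -->
<endpoint>http://minio1:9001/root2/data/</endpoint>
<!-- storage_conf_another_bucket_path.xml -->
<endpoint>http://minio1:9001/root2/another_data/</endpoint>
```

These endpoints correspond to the `root2` bucket (`minio_bucket_2`) created in the cluster helper changes of this commit, which is what lets the tests restore data into a different bucket and path.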
diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket_path.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket_path.xml new file mode 100644 index 00000000000..42207674c79 --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/storage_conf_another_bucket_path.xml @@ -0,0 +1,34 @@ + + + + + s3 + http://minio1:9001/root2/another_data/ + minio + minio123 + true + 1 + + + local + / + + + + + +
+ s3 +
+ + hdd + +
+
+
+
+ + + 0 + +
diff --git a/tests/integration/test_merge_tree_s3_restore/test.py b/tests/integration/test_merge_tree_s3_restore/test.py index 6cafc077e81..8859fa73299 100644 --- a/tests/integration/test_merge_tree_s3_restore/test.py +++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -14,9 +14,18 @@ logging.getLogger().addHandler(logging.StreamHandler()) def cluster(): try: cluster = ClickHouseCluster(__file__) - cluster.add_instance("node", main_configs=["configs/config.d/storage_conf.xml", - "configs/config.d/bg_processing_pool_conf.xml", - "configs/config.d/log_conf.xml"], user_configs=[], with_minio=True, stay_alive=True) + cluster.add_instance("node", main_configs=[ + "configs/config.d/storage_conf.xml", + "configs/config.d/bg_processing_pool_conf.xml", + "configs/config.d/log_conf.xml"], user_configs=[], with_minio=True, stay_alive=True) + cluster.add_instance("node_another_bucket", main_configs=[ + "configs/config.d/storage_conf_another_bucket.xml", + "configs/config.d/bg_processing_pool_conf.xml", + "configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True) + cluster.add_instance("node_another_bucket_path", main_configs=[ + "configs/config.d/storage_conf_another_bucket_path.xml", + "configs/config.d/bg_processing_pool_conf.xml", + "configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True) logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -34,17 +43,18 @@ def random_string(length): def generate_values(date_str, count, sign=1): data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] data.sort(key=lambda tup: tup[1]) - return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) + return ",".join(["('{}',{},'{}',{})".format(x, y, z, 0) for x, y, z in data]) -def create_table(cluster, table_name, additional_settings=None): - node = cluster.instances["node"] +def create_table(node, table_name, additional_settings=None): + node.query("CREATE DATABASE IF NOT EXISTS s3 ENGINE = Ordinary") create_table_statement = """ - CREATE TABLE {} ( + CREATE TABLE s3.{} ( dt Date, id Int64, data String, + counter Int64, INDEX min_max (id) TYPE minmax GRANULARITY 3 ) ENGINE=MergeTree() PARTITION BY dt @@ -62,45 +72,239 @@ def create_table(cluster, table_name, additional_settings=None): node.query(create_table_statement) +def purge_s3(cluster, bucket): + minio = cluster.minio_client + for obj in list(minio.list_objects(bucket, recursive=True)): + minio.remove_object(bucket, obj.object_name) + + +def drop_s3_metadata(node): + node.exec_in_container(['bash', '-c', 'rm -rf /var/lib/clickhouse/disks/s3/*'], user='root') + + +def drop_shadow_information(node): + node.exec_in_container(['bash', '-c', 'rm -rf /var/lib/clickhouse/shadow/*'], user='root') + + +def create_restore_file(node, revision='0', bucket=None, path=None): + add_restore_option = 'echo -en "{}\n" >> /var/lib/clickhouse/disks/s3/restore' + node.exec_in_container(['bash', '-c', add_restore_option.format(revision)], user='root') + if bucket: + node.exec_in_container(['bash', '-c', add_restore_option.format(bucket)], user='root') + if path: + node.exec_in_container(['bash', '-c', add_restore_option.format(path)], user='root') + + +def get_revision_counter(node, backup_number): + return node.exec_in_container(['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root') + + @pytest.fixture(autouse=True) def drop_table(cluster): yield - node = cluster.instances["node"] - minio = cluster.minio_client - node.query("DROP 
TABLE IF EXISTS s3_test NO DELAY") + node_names = ["node", "node_another_bucket", "node_another_bucket_path"] - for obj in list(minio.list_objects(cluster.minio_bucket, 'data/')): - minio.remove_object(cluster.minio_bucket, obj.object_name) + for node_name in node_names: + node = cluster.instances[node_name] + node.query("DROP TABLE IF EXISTS s3.test NO DELAY") + + drop_s3_metadata(node) + drop_shadow_information(node) + + buckets = [cluster.minio_bucket, cluster.minio_bucket_2] + for bucket in buckets: + purge_s3(cluster, bucket) -# Restore to the same bucket and path with latest revision. -def test_simple_full_restore(cluster): - create_table(cluster, "s3_test") - +def test_full_restore(cluster): node = cluster.instances["node"] - node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-03', 4096))) - node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) - node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-05', 4096))) - node.query("INSERT INTO s3_test VALUES {}".format(generate_values('2020-01-05', 4096, -1))) + create_table(node, "test") + + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096, -1))) # To ensure parts have merged - node.query("OPTIMIZE TABLE s3_test") + node.query("OPTIMIZE TABLE s3.test") - assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "({})".format(4096 * 4) - assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "({})".format(0) + assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) node.stop_clickhouse() - node.exec_in_container(['bash', '-c', 'rm -r /var/lib/clickhouse/disks/s3/*'], user='root') + drop_s3_metadata(node) node.start_clickhouse() # All data is removed. 
- assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "({})".format(0) + assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(0) node.stop_clickhouse() - node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/disks/s3/restore'], user='root') + create_restore_file(node) node.start_clickhouse() - assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "({})".format(4096 * 4) - assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "({})".format(0) + assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + + +def test_restore_another_bucket_path(cluster): + node = cluster.instances["node"] + + create_table(node, "test") + + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096, -1))) + + # To ensure parts have merged + node.query("OPTIMIZE TABLE s3.test") + + assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + + node_another_bucket = cluster.instances["node_another_bucket"] + + create_table(node_another_bucket, "test") + + node_another_bucket.stop_clickhouse() + create_restore_file(node_another_bucket, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + + node_another_bucket_path = cluster.instances["node_another_bucket_path"] + + create_table(node_another_bucket_path, "test") + + node_another_bucket_path.stop_clickhouse() + create_restore_file(node_another_bucket_path, bucket="root2", path="data") + node_another_bucket_path.start_clickhouse() + + assert node_another_bucket_path.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node_another_bucket_path.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + + +def test_restore_different_revisions(cluster): + node = cluster.instances["node"] + + create_table(node, "test") + + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) + + node.query("ALTER TABLE s3.test FREEZE") + revision1 = get_revision_counter(node, 1) + + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096, -1))) + + node.query("ALTER TABLE s3.test FREEZE") + revision2 = get_revision_counter(node, 2) + + # To ensure parts have merged + node.query("OPTIMIZE TABLE s3.test") + + node.query("ALTER TABLE s3.test FREEZE") + revision3 = get_revision_counter(node, 3) + + assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node.query("SELECT count(*) from system.parts where table = 'test'") == '5\n' + + node_another_bucket = 
cluster.instances["node_another_bucket"] + + create_table(node_another_bucket, "test") + + # Restore to revision 1 (2 parts). + node_another_bucket.stop_clickhouse() + drop_s3_metadata(node_another_bucket) + purge_s3(cluster, cluster.minio_bucket_2) + create_restore_file(node_another_bucket, revision=revision1, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT count(*) from system.parts where table = 'test'") == '2\n' + + # Restore to revision 2 (4 parts). + node_another_bucket.stop_clickhouse() + drop_s3_metadata(node_another_bucket) + purge_s3(cluster, cluster.minio_bucket_2) + create_restore_file(node_another_bucket, revision=revision2, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT count(*) from system.parts where table = 'test'") == '4\n' + + # Restore to revision 3 (4 parts + 1 merged). + node_another_bucket.stop_clickhouse() + drop_s3_metadata(node_another_bucket) + purge_s3(cluster, cluster.minio_bucket_2) + create_restore_file(node_another_bucket, revision=revision3, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT count(*) from system.parts where table = 'test'") == '5\n' + + +def test_restore_mutations(cluster): + node = cluster.instances["node"] + + create_table(node, "test") + + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096, -1))) + + node.query("ALTER TABLE s3.test FREEZE") + revision_before_mutation = get_revision_counter(node, 1) + + node.query("ALTER TABLE s3.test UPDATE counter = 1 WHERE 1", settings={"mutations_sync": 2}) + + node.query("ALTER TABLE s3.test FREEZE") + revision_after_mutation = get_revision_counter(node, 2) + + node_another_bucket = cluster.instances["node_another_bucket"] + + create_table(node_another_bucket, "test") + + # Restore to revision before mutation. + node_another_bucket.stop_clickhouse() + drop_s3_metadata(node_another_bucket) + purge_s3(cluster, cluster.minio_bucket_2) + create_restore_file(node_another_bucket, revision=revision_before_mutation, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(0) + + # Restore to revision after mutation. 
+ node_another_bucket.stop_clickhouse() + drop_s3_metadata(node_another_bucket) + purge_s3(cluster, cluster.minio_bucket_2) + create_restore_file(node_another_bucket, revision=revision_after_mutation, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test WHERE id > 0 FORMAT Values") == "({})".format(4096) + + # Restore to revision in the middle of mutation. + # Unfinished mutation should be completed after table startup. + node_another_bucket.stop_clickhouse() + drop_s3_metadata(node_another_bucket) + purge_s3(cluster, cluster.minio_bucket_2) + revision = str((int(revision_before_mutation) + int(revision_after_mutation)) // 2) + create_restore_file(node_another_bucket, revision=revision, bucket="root") + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test WHERE id > 0 FORMAT Values") == "({})".format(4096) From 53389f79c0c433f566bfc69b71971c6bc9e8adac Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 13 Jan 2021 15:05:32 +0300 Subject: [PATCH 007/122] Ability to backup-restore metadata files for DiskS3 (minor fixes) --- src/Disks/IDisk.h | 2 +- src/Disks/S3/DiskS3.cpp | 3 ++- tests/integration/helpers/cluster.py | 2 +- tests/integration/test_merge_tree_s3_restore/test.py | 9 ++++++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index a26d5015ba0..983f0dd6808 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -195,7 +195,7 @@ public: /// Returns executor to perform asynchronous operations. virtual Executor & getExecutor() { return *executor; } - /// Invoked when partitions freeze is invoked. + /// Invoked on partitions freeze query. virtual void onFreeze(const String &) { } private: diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 5787457bf11..831296032a5 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -566,6 +566,7 @@ DiskS3::DiskS3( , max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , send_metadata(send_metadata_) + , revision_counter(0) , list_object_keys_size(list_object_keys_size_) { } @@ -1091,7 +1092,7 @@ void DiskS3::restore() { LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to restore disk. 
Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString()); - throw Exception("Failed to restore disk: " + name, e, ErrorCodes::LOGICAL_ERROR); + throw; } } diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 65f438b6575..7dc847005c3 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -1398,7 +1398,7 @@ class ClickHouseKiller(object): self.clickhouse_node = clickhouse_node def __enter__(self): - self.clickhouse_node.stop_clickhouse() + self.clickhouse_node.stop_clickhouse(kill=True) def __exit__(self, exc_type, exc_val, exc_tb): self.clickhouse_node.start_clickhouse() diff --git a/tests/integration/test_merge_tree_s3_restore/test.py b/tests/integration/test_merge_tree_s3_restore/test.py index 8859fa73299..9f4aab9f35d 100644 --- a/tests/integration/test_merge_tree_s3_restore/test.py +++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -86,7 +86,7 @@ def drop_shadow_information(node): node.exec_in_container(['bash', '-c', 'rm -rf /var/lib/clickhouse/shadow/*'], user='root') -def create_restore_file(node, revision='0', bucket=None, path=None): +def create_restore_file(node, revision=0, bucket=None, path=None): add_restore_option = 'echo -en "{}\n" >> /var/lib/clickhouse/disks/s3/restore' node.exec_in_container(['bash', '-c', add_restore_option.format(revision)], user='root') if bucket: @@ -96,7 +96,7 @@ def create_restore_file(node, revision='0', bucket=None, path=None): def get_revision_counter(node, backup_number): - return node.exec_in_container(['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root') + return int(node.exec_in_container(['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root')) @pytest.fixture(autouse=True) @@ -300,10 +300,13 @@ def test_restore_mutations(cluster): node_another_bucket.stop_clickhouse() drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) - revision = str((int(revision_before_mutation) + int(revision_after_mutation)) // 2) + revision = (revision_before_mutation + revision_after_mutation) // 2 create_restore_file(node_another_bucket, revision=revision, bucket="root") node_another_bucket.start_clickhouse() + # Wait for unfinished mutation completion. 
+ time.sleep(3) + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) From 2dda8ed1e046364b63933b2b990ea27089e4d298 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Mon, 18 Jan 2021 13:37:09 +0300 Subject: [PATCH 008/122] Ability to backup-restore metadata files for DiskS3 (minor fixes) --- .../test_merge_tree_s3_restore/test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/integration/test_merge_tree_s3_restore/test.py b/tests/integration/test_merge_tree_s3_restore/test.py index 9f4aab9f35d..346d9aced3f 100644 --- a/tests/integration/test_merge_tree_s3_restore/test.py +++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -142,7 +142,7 @@ def test_full_restore(cluster): node.stop_clickhouse() create_restore_file(node) - node.start_clickhouse() + node.start_clickhouse(10) assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -170,7 +170,7 @@ def test_restore_another_bucket_path(cluster): node_another_bucket.stop_clickhouse() create_restore_file(node_another_bucket, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -181,7 +181,7 @@ def test_restore_another_bucket_path(cluster): node_another_bucket_path.stop_clickhouse() create_restore_file(node_another_bucket_path, bucket="root2", path="data") - node_another_bucket_path.start_clickhouse() + node_another_bucket_path.start_clickhouse(10) assert node_another_bucket_path.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket_path.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -223,7 +223,7 @@ def test_restore_different_revisions(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision1, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -234,7 +234,7 @@ def test_restore_different_revisions(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision2, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -245,7 +245,7 @@ def test_restore_different_revisions(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision3, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) assert 
node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -277,7 +277,7 @@ def test_restore_mutations(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision_before_mutation, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -288,7 +288,7 @@ def test_restore_mutations(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision_after_mutation, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -302,7 +302,7 @@ def test_restore_mutations(cluster): purge_s3(cluster, cluster.minio_bucket_2) revision = (revision_before_mutation + revision_after_mutation) // 2 create_restore_file(node_another_bucket, revision=revision, bucket="root") - node_another_bucket.start_clickhouse() + node_another_bucket.start_clickhouse(10) # Wait for unfinished mutation completion. time.sleep(3) From a12c666b40c8dcbbb5a4fc1ac4b01dfb01c81654 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 01:54:21 +0300 Subject: [PATCH 009/122] Documented array function --- .../functions/array-functions.md | 186 ++++++++++++++++-- 1 file changed, 169 insertions(+), 17 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index dc7727bdfd8..1b50591f835 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1288,73 +1288,225 @@ Returns the index of the first element in the `arr1` array for which `func` retu Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You must pass a lambda function to it as the first argument, and it can’t be omitted. -## arrayMin(\[func,\] arr1, …) {#array-min} +## arrayMin {#array-min} -Returns the min of the `func` values. If the function is omitted, it just returns the min of the array elements. +Returns the minimum of elements in the source array. + +If the `func` function is specified, returns the miminum of elements converted by this function. Note that the `arrayMin` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + ```sql -SELECT arrayMin([1, 2, 4]) AS res +arrayMin([func,] arr1) +``` + +**Parameters** + +- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). + +**Returned value** + +- The minimum of function values (or the array minimum). + +Type: matches the array type. 
+ +**Examples** + +Query: + +```sql +SELECT arrayMin([1, 2, 4]) AS res; +``` + +Result: + +```text ┌─res─┐ │ 1 │ └─────┘ +``` +Query: -SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +``` +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; +``` + +Result: + +```text ┌─res─┐ │ -4 │ └─────┘ ``` -## arrayMax(\[func,\] arr1, …) {#array-max} +## arrayMax {#array-max} -Returns the max of the `func` values. If the function is omitted, it just returns the max of the array elements. +Returns the maximum of elements in the source array. + +If the `func` function is specified, returns the maximum of elements converted by this function. Note that the `arrayMax` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + ```sql -SELECT arrayMax([1, 2, 4]) AS res +arrayMax([func,] arr) +``` + +**Parameters** + +- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). + +**Returned value** + +- The maximum of function values (or the array maximum). + +Type: matches the array type. + +**Examples** + +Query: + +```sql +SELECT arrayMax([1, 2, 4]) AS res; +``` +Result: + +```text ┌─res─┐ │ 4 │ └─────┘ +``` +Query: -SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +```sql +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; +``` + +Result: + +```text ┌─res─┐ │ -1 │ └─────┘ ``` -## arraySum(\[func,\] arr1, …) {#array-sum} +## arraySum {#array-sum} -Returns the sum of the `func` values. If the function is omitted, it just returns the sum of the array elements. +Returns the sum of the elements in the source array. + +If the `func` function is specified, returns the sum of elements converted by this function. Note that the `arraySum` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + ```sql -SELECT arraySum([2,3]) AS res +arraySum([func, ] arr) +``` + +**Parameters** + +- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). + +**Returned value** + +- The sum of the function values (or the array sum). + +Type: matches the array type. + +**Examples** + +Query: + +```sql +SELECT arraySum([2,3]) AS res; +``` + +Result: + +```text ┌─res─┐ │ 5 │ └─────┘ +``` +Query: -SELECT arraySum(x -> x*x, [2, 3]) AS res +```sql +SELECT arraySum(x -> x*x, [2, 3]) AS res; +``` + +Result: + +```text ┌─res─┐ │ 13 │ └─────┘ ``` +## arrayAvg {#array-avg} -## arrayAvg(\[func,\] arr1, …) {#array-avg} +Returns the average of the elements in the source array. -Returns the average of the `func` values. If the function is omitted, it just returns the average of the array elements. +If the `func` function is specified, returns the average of elements converted by this function. Note that the `arrayAvg` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. +**Syntax** + +```sql +arrayAvg([func,] arr) +``` + +**Parameters** + +- `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — Array. [Array](../../sql-reference/data-types/array.md). + +**Returned value** + +- The average of the function values (or the array average). 
+ +Type: [Float64](../../sql-reference/data-types/float.md). + +**Examples** + +Query: + +```sql +SELECT arrayAvg([1, 2, 4]) AS res; +``` + +Result: + +```text +┌────────────────res─┐ +│ 2.3333333333333335 │ +└────────────────────┘ +``` + +Query: + +```sql +SELECT arrayAvg(x -> (x * x), [2, 4]) AS res; +``` + +Result: + +```text +┌─res─┐ +│ 10 │ +└─────┘ +``` + ## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} Returns an array of partial sums of elements in the source array (a running sum). If the `func` function is specified, then the values of the array elements are converted by this function before summing. From 2b77488036777a67270ce05f7a299e0298736e22 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 05:08:22 +0300 Subject: [PATCH 010/122] Minor fixes --- docs/en/sql-reference/functions/array-functions.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 1b50591f835..402362c0601 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1292,7 +1292,7 @@ Note that the `arrayFirstIndex` is a [higher-order function](../../sql-reference Returns the minimum of elements in the source array. -If the `func` function is specified, returns the miminum of elements converted by this function. +If the `func` function is specified, returns the mininum of elements converted by this function. Note that the `arrayMin` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. @@ -1366,7 +1366,7 @@ arrayMax([func,] arr) - The maximum of function values (or the array maximum). -Type: matches the array type. +Type: matches the array type (or type of function result). **Examples** @@ -1375,6 +1375,7 @@ Query: ```sql SELECT arrayMax([1, 2, 4]) AS res; ``` + Result: ```text @@ -1399,7 +1400,7 @@ Result: ## arraySum {#array-sum} -Returns the sum of the elements in the source array. +Returns the sum of elements in the source array. If the `func` function is specified, returns the sum of elements converted by this function. @@ -1454,7 +1455,7 @@ Result: ## arrayAvg {#array-avg} -Returns the average of the elements in the source array. +Returns the average of elements in the source array. If the `func` function is specified, returns the average of elements converted by this function. @@ -1473,7 +1474,7 @@ arrayAvg([func,] arr) **Returned value** -- The average of the function values (or the array average). +- The average of function values (or the array average). Type: [Float64](../../sql-reference/data-types/float.md). 
From 95a375b4cc444128befa6690fb290690d2d842c8 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 05:41:03 +0300 Subject: [PATCH 011/122] Minor fixes --- docs/en/sql-reference/functions/array-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 402362c0601..48ef103368b 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1299,7 +1299,7 @@ Note that the `arrayMin` is a [higher-order function](../../sql-reference/functi **Syntax** ```sql -arrayMin([func,] arr1) +arrayMin([func,] arr) ``` **Parameters** @@ -1409,7 +1409,7 @@ Note that the `arraySum` is a [higher-order function](../../sql-reference/functi **Syntax** ```sql -arraySum([func, ] arr) +arraySum([func,] arr) ``` **Parameters** @@ -1428,7 +1428,7 @@ Type: matches the array type. Query: ```sql -SELECT arraySum([2,3]) AS res; +SELECT arraySum([2, 3]) AS res; ``` Result: From 96d45ca8f89a7bfea26d73067e6ba1497f57858e Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 09:11:43 +0300 Subject: [PATCH 012/122] Minor update --- docs/en/sql-reference/functions/array-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 48ef103368b..c453acfa39e 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1311,7 +1311,7 @@ arrayMin([func,] arr) - The minimum of function values (or the array minimum). -Type: matches the array type. +Type: matches the array elements type. **Examples** @@ -1366,7 +1366,7 @@ arrayMax([func,] arr) - The maximum of function values (or the array maximum). -Type: matches the array type (or type of function result). +Type: matches the array elements type. **Examples** @@ -1421,7 +1421,7 @@ arraySum([func,] arr) - The sum of the function values (or the array sum). -Type: matches the array type. +Type: for decimal numbers in source array — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [Int64](sql-reference/data-types/int-uint.md), and for numeric signed — [UInt64](sql-reference/data-types/int-uint.md). **Examples** From 9f71596809b5e045c3cbba6a412a64e2a24d0c96 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 09:48:53 +0300 Subject: [PATCH 013/122] Added translation --- .../functions/array-functions.md | 2 +- .../functions/array-functions.md | 220 +++++++++++++++++- 2 files changed, 218 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index c453acfa39e..ffbdbec535c 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1421,7 +1421,7 @@ arraySum([func,] arr) - The sum of the function values (or the array sum). -Type: for decimal numbers in source array — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [Int64](sql-reference/data-types/int-uint.md), and for numeric signed — [UInt64](sql-reference/data-types/int-uint.md). 
+Type: for decimal numbers in source array — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [UInt64](sql-reference/data-types/int-uint.md), and for numeric signed — [Int64](sql-reference/data-types/int-uint.md). **Examples** diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 015d14b9de5..ad706e33b1d 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1135,11 +1135,225 @@ SELECT Функция `arrayFirstIndex` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. -## arraySum(\[func,\] arr1, …) {#array-sum} +## arrayMin {#array-min} -Возвращает сумму значений функции `func`. Если функция не указана - просто возвращает сумму элементов массива. +Возвращает значение минимального элемента в исходном массиве. -Функция `arraySum` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. +Если передана функция `func`, возвращается минимум из элементов массива, преобразованных этой функцией. + +Функция `arrayMin` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей можно передать лямбда-функцию. + +**Синтаксис** + +```sql +arrayMin([func,] arr) +``` + +**Parameters** + +- `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). + +**Возвращаемое значение** + +- Минимальное значение функции (или минимальный элемент массива). + +Тип: соответствует типу элементов массива. + +**Примеры** + +Запрос: + +```sql +SELECT arrayMin([1, 2, 4]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +Запрос: + +``` +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ -4 │ +└─────┘ +``` + +## arrayMax {#array-max} + +Возвращает значение максимального элемента в исходном массиве. + +Если передана функция `func`, возвращается максимум из элементов массива, преобразованных этой функцией. + +Функция `arrayMax` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей можно передать лямбда-функцию. + +**Синтаксис** + +```sql +arrayMax([func,] arr) +``` + +**Параметры** + +- `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). + +**Возвращаемое значение** + +- Максимальное значение функции (или максимальный элемент массива). + +Тип: соответствует типу элементов массива. + +**Примеры** + +Запрос: + +```sql +SELECT arrayMax([1, 2, 4]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ 4 │ +└─────┘ +``` + +Запрос: + +```sql +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ -1 │ +└─────┘ +``` + +## arraySum {#array-sum} + +Возвращает сумму элементов в исходном массиве. + +Если передана функция `func`, возвращается сумма элементов массива, преобразованных этой функцией. 
+ +Функция `arraySum` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей можно передать лямбда-функцию. + +**Синтаксис** + +```sql +arraySum([func,] arr) +``` + +**Параметры** + +- `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). + +**Возвращаемое значение** + +- Сумма значений функции (или сумма элементов массива). + +Тип: для Decimal чисел в исходном массиве — [Decimal128](../../sql-reference/data-types/decimal.md), для чисел с плавающей точкой — [Float64](../../sql-reference/data-types/float.md), для беззнаковых целых чисел — [UInt64](sql-reference/data-types/int-uint.md), для целых чисел со знаком — [Int64](sql-reference/data-types/int-uint.md). + +**Примеры** + +Запрос: + +```sql +SELECT arraySum([2, 3]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ 5 │ +└─────┘ +``` + +Запрос: + +```sql +SELECT arraySum(x -> x*x, [2, 3]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ 13 │ +└─────┘ +``` + +## arrayAvg {#array-avg} + +Возвращает среднее значение элементов в исходном массиве. + +Если передана функция `func`, возвращается среднее значение элементов массива, преобразованных этой функцией. + +Функция `arrayAvg` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей можно передать лямбда-функцию. + +**Синтаксис** + +```sql +arrayAvg([func,] arr) +``` + +**Параметры** + +- `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). +- `arr` — массив. [Array](../../sql-reference/data-types/array.md). + +**Возвращаемое значение** + +- Среднее значение функции (или среднее значение элементов массива). + +Тип: [Float64](../../sql-reference/data-types/float.md). + +**Примеры** + +Запрос: + +```sql +SELECT arrayAvg([1, 2, 4]) AS res; +``` + +Результат: + +```text +┌────────────────res─┐ +│ 2.3333333333333335 │ +└────────────────────┘ +``` + +Запрос: + +```sql +SELECT arrayAvg(x -> (x * x), [2, 4]) AS res; +``` + +Результат: + +```text +┌─res─┐ +│ 10 │ +└─────┘ +``` ## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} From e89c98044e1b1ff069d9dce20ac405b4b163ae9f Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 09:58:31 +0300 Subject: [PATCH 014/122] fixed links --- docs/en/sql-reference/functions/array-functions.md | 2 +- docs/ru/sql-reference/functions/array-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index ffbdbec535c..57f2f4e95ed 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1421,7 +1421,7 @@ arraySum([func,] arr) - The sum of the function values (or the array sum). -Type: for decimal numbers in source array — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [UInt64](sql-reference/data-types/int-uint.md), and for numeric signed — [Int64](sql-reference/data-types/int-uint.md). 
+Type: for decimal numbers in source array — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [UInt64](../../sql-reference/data-types/int-uint.md), and for numeric signed — [Int64](../../sql-reference/data-types/int-uint.md). **Examples** diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index ad706e33b1d..2525787e7bf 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1268,7 +1268,7 @@ arraySum([func,] arr) - Сумма значений функции (или сумма элементов массива). -Тип: для Decimal чисел в исходном массиве — [Decimal128](../../sql-reference/data-types/decimal.md), для чисел с плавающей точкой — [Float64](../../sql-reference/data-types/float.md), для беззнаковых целых чисел — [UInt64](sql-reference/data-types/int-uint.md), для целых чисел со знаком — [Int64](sql-reference/data-types/int-uint.md). +Тип: для Decimal чисел в исходном массиве — [Decimal128](../../sql-reference/data-types/decimal.md), для чисел с плавающей точкой — [Float64](../../sql-reference/data-types/float.md), для беззнаковых целых чисел — [UInt64](../../sql-reference/data-types/int-uint.md), для целых чисел со знаком — [Int64](../../sql-reference/data-types/int-uint.md). **Примеры** From 1bd95e474ca781a587513eda6f61d276391377f5 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 29 Jan 2021 10:19:01 +0300 Subject: [PATCH 015/122] Minor fix --- docs/en/sql-reference/functions/array-functions.md | 2 +- docs/ru/sql-reference/functions/array-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 57f2f4e95ed..be6440bbe9c 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1331,7 +1331,7 @@ Result: Query: -``` +```sql SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ``` diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 2525787e7bf..3bba6f799c3 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1178,7 +1178,7 @@ SELECT arrayMin([1, 2, 4]) AS res; Запрос: -``` +```sql SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ``` From ef46c3631787aa16e56e4420ab168342674cbe0b Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 2 Feb 2021 02:26:14 +0300 Subject: [PATCH 016/122] start offset for ROWS frame --- src/Parsers/ExpressionElementParsers.cpp | 20 +- src/Processors/Transforms/WindowTransform.cpp | 256 ++++++++++++++++-- src/Processors/Transforms/WindowTransform.h | 45 ++- .../01591_window_functions.reference | 40 +++ .../0_stateless/01591_window_functions.sql | 9 + 5 files changed, 343 insertions(+), 27 deletions(-) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index ada2e3849ea..a9b38b45c1c 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -557,7 +557,15 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p } else if (parser_literal.parse(pos, ast_literal, expected)) { - node->frame.begin_offset = ast_literal->as().value.safeGet(); + const Field & value = ast_literal->as().value; + if 
(!isInt64FieldType(value.getType())) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Only integer frame offsets are supported, '{}' is not supported.", + Field::Types::toString(value.getType())); + } + node->frame.begin_offset = value.get(); + node->frame.begin_type = WindowFrame::BoundaryType::Offset; } else { @@ -603,7 +611,15 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p } else if (parser_literal.parse(pos, ast_literal, expected)) { - node->frame.end_offset = ast_literal->as().value.safeGet(); + const Field & value = ast_literal->as().value; + if (!isInt64FieldType(value.getType())) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Only integer frame offsets are supported, '{}' is not supported.", + Field::Types::toString(value.getType())); + } + node->frame.end_offset = value.get(); + node->frame.end_type = WindowFrame::BoundaryType::Offset; } else { diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 388e7a4af3b..5b942f089c1 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -165,16 +165,214 @@ void WindowTransform::advancePartitionEnd() assert(!partition_ended && partition_end == blocksEnd()); } -void WindowTransform::advanceFrameStart() const +auto WindowTransform::moveRowNumberNoCheck(const RowNumber & _x, int offset) const { - // Frame start is always UNBOUNDED PRECEDING for now, so we don't have to - // move it. It is initialized when the new partition starts. - if (window_description.frame.begin_type - != WindowFrame::BoundaryType::Unbounded) + RowNumber x = _x; + + if (offset > 0) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Frame start type '{}' is not implemented", - WindowFrame::toString(window_description.frame.begin_type)); + for (;;) + { + assertValid(x); + assert(offset >= 0); + + const auto block_rows = blockRowsNumber(x); + x.row += offset; + if (x.row >= block_rows) + { + offset = x.row - block_rows; + x.row = 0; + x.block++; + + if (x == blocksEnd()) + { + break; + } + } + else + { + offset = 0; + break; + } + } + } + else if (offset < 0) + { + for (;;) + { + assertValid(x); + assert(offset <= 0); + + if (x.row >= static_cast(-offset)) + { + x.row -= -offset; + offset = 0; + break; + } + + if (x.block == first_block_number) + { + break; + } + + // offset is negative + offset += (x.row + 1); + --x.block; + x.row = blockRowsNumber(x) - 1; + } + } + + return std::tuple{x, offset}; +} + +auto WindowTransform::moveRowNumber(const RowNumber & _x, int offset) const +{ + auto [x, o] = moveRowNumberNoCheck(_x, offset); + +#ifndef NDEBUG + // Check that it was reversible. + auto [xx, oo] = moveRowNumberNoCheck(x, -(offset - o)); + +// fmt::print(stderr, "{} -> {}, result {}, {}, new offset {}, twice {}, {}\n", +// _x, offset, x, o, -(offset - o), xx, oo); + assert(xx == _x); + assert(oo == 0); +#endif + + return std::tuple{x, o}; +} + + +void WindowTransform::advanceFrameStartRowsOffset() +{ + // Just recalculate it each time by walking blocks. + const auto [moved_row, offset_left] = moveRowNumber(current_row, + window_description.frame.begin_offset); + + frame_start = moved_row; + + assertValid(frame_start); + +// fmt::print(stderr, "frame start {} partition start {}\n", frame_start, +// partition_start); + + if (moved_row <= partition_start) + { + // Got to the beginning of partition and can't go further back. 
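+ // Clamp to the partition start: a PRECEDING offset that points at or before
+ // the first row of the partition is pinned to that first row, and the frame
+ // start is immediately known to be valid.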
+ frame_start = partition_start; + frame_started = true; + return; + } + + assert(frame_start <= partition_end); + if (frame_start == partition_end && partition_ended) + { + // A FOLLOWING frame start ran into the end of partition. + frame_started = true; + } + + assert(partition_start < frame_start); + frame_start = moved_row; + frame_started = offset_left == 0; +} + +void WindowTransform::advanceFrameStartChoose() +{ + switch (window_description.frame.begin_type) + { + case WindowFrame::BoundaryType::Unbounded: + // UNBOUNDED PRECEDING, just mark it valid. It is initialized when + // the new partition starts. + frame_started = true; + return; + case WindowFrame::BoundaryType::Offset: + switch (window_description.frame.type) + { + case WindowFrame::FrameType::Rows: + advanceFrameStartRowsOffset(); + return; + default: + // Fallthrough to the "not implemented" error. + break; + } + break; + default: + // Fallthrough to the "not implemented" error. + break; + } + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Frame start type '{}' for frame '{}' is not implemented", + WindowFrame::toString(window_description.frame.begin_type), + WindowFrame::toString(window_description.frame.type)); +} + +void WindowTransform::advanceFrameStart() +{ + if (frame_started) + { + return; + } + + const auto frame_start_before = frame_start; + advanceFrameStartChoose(); + if (frame_start == frame_start_before) + { + return; + } + + assert(frame_start_before < frame_start); + assert(partition_start <= frame_start); + assert(frame_start <= partition_end); + if (partition_ended && frame_start == partition_end) + { + // Check that if the start of frame (e.g. FOLLOWING) runs into the end + // of partition, it is marked as valid -- we can't advance it any + // further. + assert(frame_started); + } + + // We're very dumb and have to reinitialize aggregate functions if the frame + // start changed. No point in doing it if we don't yet know where the frame + // starts. + if (!frame_started) + { + return; + } + + // frame_end value might not be valid yet, but we know that it is greater or + // equal than frame_start. If it's less than the new frame_start, we have to + // skip rows between frame_end and frame_start, because they are not in the + // frame and must not contribute to the value of aggregate functions. 
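+ // The aggregate state is then rebuilt from scratch over [frame_start, frame_end)
+ // below (destroy/create plus re-adding each row), since aggregate functions
+ // cannot retract rows that have fallen out of the frame.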
+ if (frame_end < frame_start) + { + frame_end = frame_start; + } + + for (auto & ws : workspaces) + { + const auto & f = ws.window_function; + const auto * a = f.aggregate_function.get(); + auto * buf = ws.aggregate_function_state.data(); + + a->destroy(buf); + a->create(buf); + + for (auto row = frame_start; row < frame_end; advanceRowNumber(row)) + { + if (row.block != ws.cached_block_number) + { + ws.argument_columns.clear(); + for (const auto i : ws.argument_column_indices) + { + ws.argument_columns.push_back(inputAt(row)[i].get()); + } + ws.cached_block_number = row.block; + } + + a->add(buf, ws.argument_columns.data(), row.row, arena.get()); +// fmt::print(stderr, "(1) add row {}\n", row.row); + } } } @@ -356,6 +554,7 @@ void WindowTransform::advanceFrameEnd() auto * columns = ws.argument_columns.data(); for (auto row = frame_end_before.row; row < rows_end; ++row) { +// fmt::print(stderr, "(2) add row {}\n", row); a->add(buf, columns, row, arena.get()); } } @@ -414,8 +613,8 @@ void WindowTransform::appendChunk(Chunk & chunk) for (;;) { advancePartitionEnd(); -// fmt::print(stderr, "partition [?, {}), {}\n", -// partition_end, partition_ended); +// fmt::print(stderr, "partition [{}, {}), {}\n", +// partition_start, partition_end, partition_ended); // Either we ran out of data or we found the end of partition (maybe // both, but this only happens at the total end of data). @@ -433,12 +632,21 @@ void WindowTransform::appendChunk(Chunk & chunk) // Advance the frame start, updating the state of the aggregate // functions. advanceFrameStart(); + + if (!frame_started) + { + // Wait for more input data to find the start of frame. + assert(!input_is_finished); + assert(!partition_ended); + } + // Advance the frame end, updating the state of the aggregate // functions. advanceFrameEnd(); -// fmt::print(stderr, "row {} frame [{}, {}) {}\n", -// current_row, frame_start, frame_end, frame_ended); +// fmt::print(stderr, "row {} frame [{}, {}) {}, {}\n", +// current_row, frame_start, frame_end, +// frame_started, frame_ended); if (!frame_ended) { @@ -448,8 +656,10 @@ void WindowTransform::appendChunk(Chunk & chunk) return; } - // The frame shouldn't be empty (probably?). - assert(frame_start < frame_end); + // The frame can be empty sometimes, e.g. the boundaries coincide + // or the start is after the partition end. But hopefully start is + // not after end. + assert(frame_start <= frame_end); // Write out the aggregation results. writeOutCurrentRow(); @@ -458,6 +668,7 @@ void WindowTransform::appendChunk(Chunk & chunk) advanceRowNumber(current_row); first_not_ready_row = current_row; frame_ended = false; + frame_started = false; } if (input_is_finished) @@ -478,15 +689,15 @@ void WindowTransform::appendChunk(Chunk & chunk) } // Start the next partition. - const auto new_partition_start = partition_end; + partition_start = partition_end; advanceRowNumber(partition_end); partition_ended = false; // We have to reset the frame when the new partition starts. This is not a // generally correct way to do so, but we don't really support moving frame // for now. 
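+ // (partition_start is now tracked as an explicit member so that a PRECEDING
+ // frame start can be cut off at the beginning of the partition; see the
+ // comment on partition_start in WindowTransform.h.)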
- frame_start = new_partition_start; - frame_end = new_partition_start; - assert(current_row == new_partition_start); + frame_start = partition_start; + frame_end = partition_start; + assert(current_row == partition_start); // fmt::print(stderr, "reinitialize agg data at start of {}\n", // new_partition_start); @@ -534,6 +745,15 @@ IProcessor::Status WindowTransform::prepare() return Status::Finished; } + if (output_data.exception) + { + // An exception occurred during processing. + output.pushData(std::move(output_data)); + output.finish(); + input.close(); + return Status::Finished; + } + assert(first_not_ready_row.block >= first_block_number); // The first_not_ready_row might be past-the-end if we have already // calculated the window functions for all input rows. That's why the diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 39ccd4f96f9..afc44b2f706 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -53,6 +53,11 @@ struct RowNumber { return block == other.block && row == other.row; } + + bool operator <= (const RowNumber & other) const + { + return *this < other || *this == other; + } }; /* @@ -101,7 +106,9 @@ public: private: void advancePartitionEnd(); - void advanceFrameStart() const; + void advanceFrameStart(); + void advanceFrameStartChoose(); + void advanceFrameStartRowsOffset(); void advanceFrameEnd(); void advanceFrameEndCurrentRow(); void advanceFrameEndUnbounded(); @@ -169,9 +176,28 @@ private: #endif } + auto moveRowNumber(const RowNumber & _x, int offset) const; + auto moveRowNumberNoCheck(const RowNumber & _x, int offset) const; + + void assertValid(const RowNumber & x) const + { + assert(x.block >= first_block_number); + if (x.block == first_block_number + blocks.size()) + { + assert(x.row == 0); + } + else + { + assert(x.row < blockRowsNumber(x)); + } + } + RowNumber blocksEnd() const { return RowNumber{first_block_number + blocks.size(), 0}; } + RowNumber blocksBegin() const + { return RowNumber{first_block_number, 0}; } + public: /* * Data (formerly) inherited from ISimpleTransform, needed for the @@ -217,18 +243,22 @@ public: // Used to determine which resulting blocks we can pass to the consumer. RowNumber first_not_ready_row; - // We don't keep the pointer to start of partition, because we don't really - // need it, and we want to be able to drop the starting blocks to save memory. - // The `partition_end` is past-the-end, as usual. When partition_ended = false, - // it still haven't ended, and partition_end is the next row to check. + // Boundaries of the current partition. + // partition_start doesn't point to a valid block, because we want to drop + // the blocks early to save memory. We still have track it so that we can + // cut off a PRECEDING frame at the partition start. + // The `partition_end` is past-the-end, as usual. When + // partition_ended = false, it still haven't ended, and partition_end is the + // next row to check. + RowNumber partition_start; RowNumber partition_end; bool partition_ended = false; // This is the row for which we are computing the window functions now. RowNumber current_row; - // The frame is [frame_start, frame_end) if frame_ended, and unknown - // otherwise. Note that when we move to the next row, both the + // The frame is [frame_start, frame_end) if frame_ended && frame_started, + // and unknown otherwise. 
Note that when we move to the next row, both the // frame_start and the frame_end may jump forward by an unknown amount of // blocks, e.g. if we use a RANGE frame. This means that sometimes we don't // know neither frame_end nor frame_start. @@ -239,6 +269,7 @@ public: RowNumber frame_start; RowNumber frame_end; bool frame_ended = false; + bool frame_started = false; }; } diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 1e9b83b9983..7108d8fda8c 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -516,3 +516,43 @@ settings max_block_size = 2; 27 27 29 29 27 27 29 29 30 30 30 30 +-- ROWS offset frame start +select number, p, + count(*) over (partition by p order by number + rows between 1 preceding and unbounded following), + count(*) over (partition by p order by number + rows between 1 following and unbounded following) +from (select number, intDiv(number, 5) p from numbers(31)) +order by p, number +settings max_block_size = 2; +0 0 5 4 +1 0 5 3 +2 0 4 2 +3 0 3 1 +4 0 2 0 +5 1 5 4 +6 1 5 3 +7 1 4 2 +8 1 3 1 +9 1 2 0 +10 2 5 4 +11 2 5 3 +12 2 4 2 +13 2 3 1 +14 2 2 0 +15 3 5 4 +16 3 5 3 +17 3 4 2 +18 3 3 1 +19 3 2 0 +20 4 5 4 +21 4 5 3 +22 4 4 2 +23 4 3 1 +24 4 2 0 +25 5 5 4 +26 5 5 3 +27 5 4 2 +28 5 3 1 +29 5 2 0 +30 6 1 0 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index a5b30fb884a..c4f1cfec5d5 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -163,3 +163,12 @@ window rows between unbounded preceding and unbounded following) settings max_block_size = 2; +-- ROWS offset frame start +select number, p, + count(*) over (partition by p order by number + rows between 1 preceding and unbounded following), + count(*) over (partition by p order by number + rows between 1 following and unbounded following) +from (select number, intDiv(number, 5) p from numbers(31)) +order by p, number +settings max_block_size = 2; From 78a9624fc57c3053ab584aff18f0e9b8a674f07f Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 2 Feb 2021 02:11:40 +0300 Subject: [PATCH 017/122] CURRENT ROW frame start for ROWS frame --- src/Processors/Transforms/WindowTransform.cpp | 16 +++++++++++++--- .../0_stateless/01591_window_functions.reference | 2 +- .../0_stateless/01591_window_functions.sql | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 5b942f089c1..4eb47f435d1 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -285,6 +285,19 @@ void WindowTransform::advanceFrameStartChoose() // the new partition starts. frame_started = true; return; + case WindowFrame::BoundaryType::Current: + switch (window_description.frame.type) + { + case WindowFrame::FrameType::Rows: + // CURRENT ROW + frame_start = current_row; + frame_started = true; + return; + default: + // Fallthrough to the "not implemented" error. + break; + } + break; case WindowFrame::BoundaryType::Offset: switch (window_description.frame.type) { @@ -296,9 +309,6 @@ void WindowTransform::advanceFrameStartChoose() break; } break; - default: - // Fallthrough to the "not implemented" error. 
- break; } throw Exception(ErrorCodes::NOT_IMPLEMENTED, diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 7108d8fda8c..8c0ef9ecaa6 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -516,7 +516,7 @@ settings max_block_size = 2; 27 27 29 29 27 27 29 29 30 30 30 30 --- ROWS offset frame start +-- CURRENT ROW and offset for ROWS frame start select number, p, count(*) over (partition by p order by number rows between 1 preceding and unbounded following), diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index c4f1cfec5d5..3b4bdd03724 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -167,6 +167,8 @@ settings max_block_size = 2; select number, p, count(*) over (partition by p order by number rows between 1 preceding and unbounded following), + count(*) over (partition by p order by number + rows between current row and unbounded following), count(*) over (partition by p order by number rows between 1 following and unbounded following) from (select number, intDiv(number, 5) p from numbers(31)) From d084625436eae7b7e58b4214aa454d97e4e97f4e Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 2 Feb 2021 03:51:35 +0300 Subject: [PATCH 018/122] cleanup --- src/Parsers/ExpressionElementParsers.cpp | 1 + .../01591_window_functions.reference | 66 ++++++++++--------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index a9b38b45c1c..de327e3f0f3 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -45,6 +45,7 @@ namespace DB namespace ErrorCodes { + extern const int BAD_ARGUMENTS; extern const int SYNTAX_ERROR; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 8c0ef9ecaa6..7e286f753e5 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -516,43 +516,45 @@ settings max_block_size = 2; 27 27 29 29 27 27 29 29 30 30 30 30 --- CURRENT ROW and offset for ROWS frame start +-- ROWS offset frame start select number, p, count(*) over (partition by p order by number rows between 1 preceding and unbounded following), + count(*) over (partition by p order by number + rows between current row and unbounded following), count(*) over (partition by p order by number rows between 1 following and unbounded following) from (select number, intDiv(number, 5) p from numbers(31)) order by p, number settings max_block_size = 2; -0 0 5 4 -1 0 5 3 -2 0 4 2 -3 0 3 1 -4 0 2 0 -5 1 5 4 -6 1 5 3 -7 1 4 2 -8 1 3 1 -9 1 2 0 -10 2 5 4 -11 2 5 3 -12 2 4 2 -13 2 3 1 -14 2 2 0 -15 3 5 4 -16 3 5 3 -17 3 4 2 -18 3 3 1 -19 3 2 0 -20 4 5 4 -21 4 5 3 -22 4 4 2 -23 4 3 1 -24 4 2 0 -25 5 5 4 -26 5 5 3 -27 5 4 2 -28 5 3 1 -29 5 2 0 -30 6 1 0 +0 0 5 5 4 +1 0 5 4 3 +2 0 4 3 2 +3 0 3 2 1 +4 0 2 1 0 +5 1 5 5 4 +6 1 5 4 3 +7 1 4 3 2 +8 1 3 2 1 +9 1 2 1 0 +10 2 5 5 4 +11 2 5 4 3 +12 2 4 3 2 +13 2 3 2 1 +14 2 2 1 0 +15 3 5 5 4 +16 3 5 4 3 +17 3 4 3 2 +18 3 3 2 1 +19 3 2 1 0 +20 4 5 5 4 +21 4 5 4 3 +22 4 4 3 2 +23 4 3 2 1 +24 4 2 1 0 +25 5 
5 5 4 +26 5 5 4 3 +27 5 4 3 2 +28 5 3 2 1 +29 5 2 1 0 +30 6 1 1 0 From 8bd026271a08b501a4852e1bd74c95632ff1aa37 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 2 Feb 2021 04:16:44 +0300 Subject: [PATCH 019/122] more cleanup --- src/Processors/Transforms/WindowTransform.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 4eb47f435d1..23acc85aef0 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -326,12 +326,19 @@ void WindowTransform::advanceFrameStart() const auto frame_start_before = frame_start; advanceFrameStartChoose(); + assert(frame_start_before <= frame_start); if (frame_start == frame_start_before) { - return; + // If the frame start didn't move, this means we validated that the frame + // starts at the point we reached earlier but were unable to validate. + // This probably only happens in degenerate cases where the frame start + // is further than the end of partition, and the partition ends at the + // last row of the block, but we can only tell for sure after a new + // block arrives. We still have to update the state of aggregate + // functions when the frame start becomes valid, so we continue. + assert(frame_started); } - assert(frame_start_before < frame_start); assert(partition_start <= frame_start); assert(frame_start <= partition_end); if (partition_ended && frame_start == partition_end) @@ -669,6 +676,8 @@ void WindowTransform::appendChunk(Chunk & chunk) // The frame can be empty sometimes, e.g. the boundaries coincide // or the start is after the partition end. But hopefully start is // not after end. + assert(frame_started); + assert(frame_ended); assert(frame_start <= frame_end); // Write out the aggregation results. From 35e7c15394e684da2b2744bbefae28617eb0c229 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 2 Feb 2021 05:58:25 +0300 Subject: [PATCH 020/122] Update docs/ru/sql-reference/functions/array-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/array-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 3bba6f799c3..68766cafe60 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1149,7 +1149,7 @@ SELECT arrayMin([func,] arr) ``` -**Parameters** +**Параметры** - `func` — функция. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — массив. [Array](../../sql-reference/data-types/array.md). From 921518db0a9637ea1b3452fa6ca90c9758c76df9 Mon Sep 17 00:00:00 2001 From: Aleksei Semiglazov Date: Fri, 20 Nov 2020 17:23:53 +0000 Subject: [PATCH 021/122] CLICKHOUSE-606: query deduplication based on parts' UUID * add the query data deduplication excluding duplicated parts in MergeTree family engines. query deduplication is based on parts' UUID which should be enabled first with merge_tree setting assign_part_uuids=1 allow_experimental_query_deduplication setting is to enable part deduplication, default ot false. data part UUID is a mechanism of giving a data part a unique identifier. 
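A rough user-level sketch of how the pieces described above are meant to fit together (illustrative only, not taken from this patch; the table name `t` is made up):

```sql
-- Parts are only assigned UUIDs when the MergeTree setting is enabled.
CREATE TABLE t (x UInt64)
ENGINE = MergeTree
ORDER BY x
SETTINGS assign_part_uuids = 1;

INSERT INTO t VALUES (1), (2);

-- The new _part_uuid virtual column exposes the UUID of each part and can be
-- used in predicates.
SELECT _part, _part_uuid, x FROM t;

-- Opt in to UUID-based part deduplication for distributed reads.
SET allow_experimental_query_deduplication = 1;
```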
Having UUID and deduplication mechanism provides a potential of moving parts between shards preserving data consistency on a read path: duplicated UUIDs will cause root executor to retry query against on of the replica explicitly asking to exclude encountered duplicated fingerprints during a distributed query execution. NOTE: this implementation don't provide any knobs to lock part and hence its UUID. Any mutations/merge will update part's UUID. * add _part_uuid virtual column, allowing to use UUIDs in predicates. Signed-off-by: Aleksei Semiglazov address comments --- programs/client/Client.cpp | 3 + src/Client/Connection.cpp | 10 ++ src/Client/Connection.h | 3 + src/Client/MultiplexedConnections.cpp | 17 ++ src/Client/MultiplexedConnections.h | 3 + src/Columns/ColumnsNumber.h | 2 + src/Common/ErrorCodes.cpp | 1 + src/Core/Protocol.h | 8 +- src/Core/Settings.h | 2 + src/DataStreams/RemoteQueryExecutor.cpp | 79 ++++++++- src/DataStreams/RemoteQueryExecutor.h | 18 ++ src/IO/WriteHelpers.h | 1 + src/Interpreters/Context.cpp | 19 ++ src/Interpreters/Context.h | 8 + src/Server/TCPHandler.cpp | 39 +++++ src/Server/TCPHandler.h | 5 + .../MergeTreeBaseSelectProcessor.cpp | 21 +++ src/Storages/MergeTree/MergeTreeData.cpp | 2 + .../MergeTree/MergeTreeDataPartUUID.cpp | 38 ++++ .../MergeTree/MergeTreeDataPartUUID.h | 34 ++++ .../MergeTree/MergeTreeDataSelectExecutor.cpp | 125 ++++++++++--- src/Storages/StorageDistributed.cpp | 2 + src/Storages/ya.make | 1 + .../test_query_deduplication/__init__.py | 0 .../configs/deduplication_settings.xml | 5 + .../configs/remote_servers.xml | 24 +++ .../test_query_deduplication/test.py | 165 ++++++++++++++++++ 27 files changed, 607 insertions(+), 28 deletions(-) create mode 100644 src/Storages/MergeTree/MergeTreeDataPartUUID.cpp create mode 100644 src/Storages/MergeTree/MergeTreeDataPartUUID.h create mode 100644 tests/integration/test_query_deduplication/__init__.py create mode 100644 tests/integration/test_query_deduplication/configs/deduplication_settings.xml create mode 100644 tests/integration/test_query_deduplication/configs/remote_servers.xml create mode 100644 tests/integration/test_query_deduplication/test.py diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 9a8b580407a..8d3a1ba7c74 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1900,6 +1900,9 @@ private: switch (packet.type) { + case Protocol::Server::PartUUIDs: + return true; + case Protocol::Server::Data: if (!cancelled) onData(packet.block); diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 65b15a46955..e38a6b240a6 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -542,6 +542,12 @@ void Connection::sendData(const Block & block, const String & name, bool scalar) throttler->add(out->count() - prev_bytes); } +void Connection::sendIgnoredPartUUIDs(const std::vector & uuids) +{ + writeVarUInt(Protocol::Client::IgnoredPartUUIDs, *out); + writeVectorBinary(uuids, *out); + out->next(); +} void Connection::sendPreparedData(ReadBuffer & input, size_t size, const String & name) { @@ -798,6 +804,10 @@ Packet Connection::receivePacket(std::function async_ case Protocol::Server::EndOfStream: return res; + case Protocol::Server::PartUUIDs: + readVectorBinary(res.part_uuids, *in); + return res; + default: /// In unknown state, disconnect - to not leave unsynchronised connection. 
disconnect(); diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 83e8f3ba206..2d24b143d7a 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -66,6 +66,7 @@ struct Packet std::vector multistring_message; Progress progress; BlockStreamProfileInfo profile_info; + std::vector part_uuids; Packet() : type(Protocol::Server::Hello) {} }; @@ -157,6 +158,8 @@ public: void sendScalarsData(Scalars & data); /// Send all contents of external (temporary) tables. void sendExternalTablesData(ExternalTablesData & data); + /// Send parts' uuids to excluded them from query processing + void sendIgnoredPartUUIDs(const std::vector & uuids); /// Send prepared block of data (serialized and, if need, compressed), that will be read from 'input'. /// You could pass size of serialized/compressed block. diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index ed7aad0a515..c50dd7b6454 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -140,6 +140,21 @@ void MultiplexedConnections::sendQuery( sent_query = true; } +void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector & uuids) +{ + std::lock_guard lock(cancel_mutex); + + if (sent_query) + throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR); + + for (ReplicaState & state : replica_states) + { + Connection * connection = state.connection; + if (connection != nullptr) + connection->sendIgnoredPartUUIDs(uuids); + } +} + Packet MultiplexedConnections::receivePacket() { std::lock_guard lock(cancel_mutex); @@ -195,6 +210,7 @@ Packet MultiplexedConnections::drain() switch (packet.type) { + case Protocol::Server::PartUUIDs: case Protocol::Server::Data: case Protocol::Server::Progress: case Protocol::Server::ProfileInfo: @@ -253,6 +269,7 @@ Packet MultiplexedConnections::receivePacketUnlocked(std::function & uuids); + /** On each replica, read and skip all packets to EndOfStream or Exception. * Returns EndOfStream if no exception has been received. Otherwise * returns the last received packet of type Exception. diff --git a/src/Columns/ColumnsNumber.h b/src/Columns/ColumnsNumber.h index 96ce2bd6d6f..17a28e617c3 100644 --- a/src/Columns/ColumnsNumber.h +++ b/src/Columns/ColumnsNumber.h @@ -26,4 +26,6 @@ using ColumnInt256 = ColumnVector; using ColumnFloat32 = ColumnVector; using ColumnFloat64 = ColumnVector; +using ColumnUUID = ColumnVector; + } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index a2cd65137c0..09e5945f2b5 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -533,6 +533,7 @@ M(564, INTERSERVER_SCHEME_DOESNT_MATCH) \ M(565, TOO_MANY_PARTITIONS) \ M(566, CANNOT_RMDIR) \ + M(567, DUPLICATED_PART_UUIDS) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Core/Protocol.h b/src/Core/Protocol.h index f383e509751..df51a0cb61a 100644 --- a/src/Core/Protocol.h +++ b/src/Core/Protocol.h @@ -75,8 +75,9 @@ namespace Protocol TablesStatusResponse = 9, /// A response to TablesStatus request. Log = 10, /// System logs of the query execution TableColumns = 11, /// Columns' description for default values calculation + PartUUIDs = 12, /// List of unique parts ids. - MAX = TableColumns, + MAX = PartUUIDs, }; /// NOTE: If the type of packet argument would be Enum, the comparison packet >= 0 && packet < 10 @@ -98,6 +99,7 @@ namespace Protocol "TablesStatusResponse", "Log", "TableColumns", + "PartUUIDs", }; return packet <= MAX ? 
data[packet] @@ -132,8 +134,9 @@ namespace Protocol TablesStatusRequest = 5, /// Check status of tables on the server. KeepAlive = 6, /// Keep the connection alive Scalar = 7, /// A block of data (compressed or not). + IgnoredPartUUIDs = 8, /// List of unique parts ids to exclude from query processing - MAX = Scalar, + MAX = IgnoredPartUUIDs, }; inline const char * toString(UInt64 packet) @@ -147,6 +150,7 @@ namespace Protocol "TablesStatusRequest", "KeepAlive", "Scalar", + "IgnoredPartUUIDs", }; return packet <= MAX ? data[packet] diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c4cf3803913..ecd3fa9e746 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -420,6 +420,8 @@ class IColumn; M(Bool, async_socket_for_remote, true, "Asynchronously read from socket executing remote query", 0) \ \ M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ + M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \ + \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ M(UInt64, max_memory_usage_for_all_queries, 0, "Obsolete. Will be removed after 2020-10-20", 0) \ diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 14e51ffefdf..ce7db264eef 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace DB { @@ -20,6 +21,7 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_PACKET_FROM_SERVER; + extern const int DUPLICATED_PART_UUIDS; } RemoteQueryExecutor::RemoteQueryExecutor( @@ -158,6 +160,7 @@ void RemoteQueryExecutor::sendQuery() std::lock_guard guard(was_cancelled_mutex); established = true; + was_cancelled = false; auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings); ClientInfo modified_client_info = context.getClientInfo(); @@ -167,6 +170,14 @@ void RemoteQueryExecutor::sendQuery() modified_client_info.client_trace_context = CurrentThread::get().thread_trace_context; } + { + std::lock_guard lock(duplicated_part_uuids_mutex); + if (!duplicated_part_uuids.empty()) + { + multiplexed_connections->sendIgnoredPartUUIDs(duplicated_part_uuids); + } + } + multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true); established = false; @@ -195,7 +206,29 @@ Block RemoteQueryExecutor::read() Packet packet = multiplexed_connections->receivePacket(); if (auto block = processPacket(std::move(packet))) + { + if (got_duplicated_part_uuids) + { + /// Cancel previous query and disconnect before retry. + cancel(); + multiplexed_connections->disconnect(); + + /// Only resend once, otherwise throw an exception + if (!resent_query) + { + if (log) + LOG_DEBUG(log, "Found duplicate UUIDs, will retry query without those parts"); + + resent_query = true; + sent_query = false; + got_duplicated_part_uuids = false; + /// Consecutive read will implicitly send query first. 
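+ /// The re-sent query also carries the collected duplicated_part_uuids
+ /// (see sendQuery()), so the replicas skip those parts on the retry.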
+ return read(); + } + throw Exception("Found duplicate uuids while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); + } return *block; + } } } @@ -233,7 +266,29 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr else { if (auto data = processPacket(std::move(read_context->packet))) + { + if (got_duplicated_part_uuids) + { + /// Cancel previous query and disconnect before retry. + cancel(); + multiplexed_connections->disconnect(); + + /// Only resend once, otherwise throw an exception + if (!resent_query) + { + if (log) + LOG_DEBUG(log, "Found duplicate UUIDs, will retry query without those parts"); + + resent_query = true; + sent_query = false; + got_duplicated_part_uuids = false; + /// Consecutive read will implicitly send query first. + return read(read_context); + } + throw Exception("Found duplicate uuids while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); + } return std::move(*data); + } } } while (true); @@ -246,6 +301,13 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) { switch (packet.type) { + case Protocol::Server::PartUUIDs: + if (!setPartUUIDs(packet.part_uuids)) + { + got_duplicated_part_uuids = true; + return Block(); + } + break; case Protocol::Server::Data: /// If the block is not empty and is not a header block if (packet.block && (packet.block.rows() > 0)) @@ -306,6 +368,20 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) return {}; } +bool RemoteQueryExecutor::setPartUUIDs(const std::vector & uuids) +{ + Context & query_context = const_cast(context).getQueryContext(); + auto duplicates = query_context.getPartUUIDs()->add(uuids); + + if (!duplicates.empty()) + { + std::lock_guard lock(duplicated_part_uuids_mutex); + duplicated_part_uuids.insert(duplicated_part_uuids.begin(), duplicates.begin(), duplicates.end()); + return false; + } + return true; +} + void RemoteQueryExecutor::finish(std::unique_ptr * read_context) { /** If one of: @@ -383,6 +459,7 @@ void RemoteQueryExecutor::sendExternalTables() { std::lock_guard lock(external_tables_mutex); + external_tables_data.clear(); external_tables_data.reserve(count); for (size_t i = 0; i < count; ++i) @@ -446,7 +523,7 @@ bool RemoteQueryExecutor::isQueryPending() const bool RemoteQueryExecutor::hasThrownException() const { - return got_exception_from_replica || got_unknown_packet_from_replica; + return got_exception_from_replica || got_unknown_packet_from_replica || got_duplicated_part_uuids; } } diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index 46d9d067563..843cf75f1f8 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -57,6 +57,9 @@ public: /// Create connection and send query, external tables and scalars. void sendQuery(); + /// Query is resent to a replica, the query itself can be modified. + std::atomic resent_query { false }; + /// Read next block of data. Returns empty block if query is finished. 
Block read(); @@ -152,6 +155,14 @@ private: */ std::atomic got_unknown_packet_from_replica { false }; + /** Got duplicated uuids from replica + */ + std::atomic got_duplicated_part_uuids{ false }; + + /// Parts uuids, collected from remote replicas + std::mutex duplicated_part_uuids_mutex; + std::vector duplicated_part_uuids; + PoolMode pool_mode = PoolMode::GET_MANY; StorageID main_table = StorageID::createEmpty(); @@ -163,6 +174,10 @@ private: /// Send all temporary tables to remote servers void sendExternalTables(); + /** Set part uuids to a query context, collected from remote replicas. + */ + bool setPartUUIDs(const std::vector & uuids); + /// If wasn't sent yet, send request to cancel all connections to replicas void tryCancel(const char * reason, std::unique_ptr * read_context); @@ -174,6 +189,9 @@ private: /// Process packet for read and return data block if possible. std::optional processPacket(Packet packet); + + /// Reads packet by packet + Block readPackets(); }; } diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index 9072f306bd9..a37a5b5ddc6 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -910,6 +910,7 @@ inline void writeBinary(const StringRef & x, WriteBuffer & buf) { writeStringBin inline void writeBinary(const std::string_view & x, WriteBuffer & buf) { writeStringBinary(x, buf); } inline void writeBinary(const Int128 & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const UInt128 & x, WriteBuffer & buf) { writePODBinary(x, buf); } +inline void writeBinary(const UUID & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const DummyUInt256 & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const Decimal32 & x, WriteBuffer & buf) { writePODBinary(x, buf); } inline void writeBinary(const Decimal64 & x, WriteBuffer & buf) { writePODBinary(x, buf); } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 8ff317764a7..9a1fcf6a067 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -64,6 +64,7 @@ #include #include #include +#include namespace ProfileEvents @@ -2510,4 +2511,22 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } +PartUUIDsPtr Context::getPartUUIDs() +{ + auto lock = getLock(); + if (!part_uuids) + part_uuids = std::make_shared(); + + return part_uuids; +} + +PartUUIDsPtr Context::getIgnoredPartUUIDs() +{ + auto lock = getLock(); + if (!ignored_part_uuids) + ignored_part_uuids = std::make_shared(); + + return ignored_part_uuids; +} + } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 5801cc2b949..4dbdf390473 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -107,6 +107,8 @@ using StoragePolicyPtr = std::shared_ptr; using StoragePoliciesMap = std::map; class StoragePolicySelector; using StoragePolicySelectorPtr = std::shared_ptr; +struct PartUUIDs; +using PartUUIDsPtr = std::shared_ptr; class IOutputFormat; using OutputFormatPtr = std::shared_ptr; @@ -264,6 +266,9 @@ private: using SampleBlockCache = std::unordered_map; mutable SampleBlockCache sample_block_cache; + std::shared_ptr part_uuids; /// set of parts' uuids, is used for query parts deduplication + std::shared_ptr ignored_part_uuids; /// set of parts' uuids are meant to be excluded from query processing + NameToNameMap query_parameters; /// Dictionary with query parameters for prepared statements. 
/// (key=name, value) @@ -734,6 +739,9 @@ public: }; MySQLWireContext mysql; + + PartUUIDsPtr getPartUUIDs(); + PartUUIDsPtr getIgnoredPartUUIDs(); private: std::unique_lock getLock() const; diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 12d1a0249b7..0d040652342 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -180,10 +181,16 @@ void TCPHandler::runImpl() /** If Query - process it. If Ping or Cancel - go back to the beginning. * There may come settings for a separate query that modify `query_context`. + * It's possible to receive part uuids packet before the query, so then receivePacket has to be called twice. */ if (!receivePacket()) continue; + /** If part_uuids got received in previous packet, trying to read again. + */ + if (state.empty() && state.part_uuids && !receivePacket()) + continue; + query_scope.emplace(*query_context); send_exception_with_stack_trace = query_context->getSettingsRef().calculate_text_stack_trace; @@ -528,6 +535,8 @@ void TCPHandler::processOrdinaryQuery() /// Pull query execution result, if exists, and send it to network. if (state.io.in) { + sendPartUUIDs(); + /// This allows the client to prepare output format if (Block header = state.io.in->getHeader()) sendData(header); @@ -592,6 +601,8 @@ void TCPHandler::processOrdinaryQueryWithProcessors() { auto & pipeline = state.io.pipeline; + sendPartUUIDs(); + /// Send header-block, to allow client to prepare output format for data to send. { const auto & header = pipeline.getHeader(); @@ -693,6 +704,20 @@ void TCPHandler::receiveUnexpectedTablesStatusRequest() throw NetException("Unexpected packet TablesStatusRequest received from client", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); } +void TCPHandler::sendPartUUIDs() +{ + auto uuids = query_context->getPartUUIDs()->get(); + if (!uuids.empty()) + { + for (const auto & uuid : uuids) + LOG_TRACE(log, "Sending UUID: {}", toString(uuid)); + + writeVarUInt(Protocol::Server::PartUUIDs, *out); + writeVectorBinary(uuids, *out); + out->next(); + } +} + void TCPHandler::sendProfileInfo(const BlockStreamProfileInfo & info) { writeVarUInt(Protocol::Server::ProfileInfo, *out); @@ -905,6 +930,10 @@ bool TCPHandler::receivePacket() switch (packet_type) { + case Protocol::Client::IgnoredPartUUIDs: + /// Part uuids packet if any comes before query. 
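+ /// Returning true makes runImpl() call receivePacket() once more (see the
+ /// state.part_uuids check there) to read the Query packet that follows.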
+ receiveIgnoredPartUUIDs(); + return true; case Protocol::Client::Query: if (!state.empty()) receiveUnexpectedQuery(); @@ -940,6 +969,16 @@ bool TCPHandler::receivePacket() } } +void TCPHandler::receiveIgnoredPartUUIDs() +{ + state.part_uuids = true; + std::vector uuids; + readVectorBinary(uuids, *in); + + if (!uuids.empty()) + query_context->getIgnoredPartUUIDs()->add(uuids); +} + void TCPHandler::receiveClusterNameAndSalt() { readStringBinary(cluster, *in); diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 0d3109a6591..41539bef1e1 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -67,6 +67,9 @@ struct QueryState /// Temporary tables read bool temporary_tables_read = false; + /// A state got uuids to exclude from a query + bool part_uuids = false; + /// Request requires data from client for function input() bool need_receive_data_for_input = false; /// temporary place for incoming data block for input() @@ -173,6 +176,7 @@ private: void receiveHello(); bool receivePacket(); void receiveQuery(); + void receiveIgnoredPartUUIDs(); bool receiveData(bool scalar); bool readDataNext(const size_t & poll_interval, const int & receive_timeout); void readData(const Settings & connection_settings); @@ -201,6 +205,7 @@ private: void sendProgress(); void sendLogs(); void sendEndOfStream(); + void sendPartUUIDs(); void sendProfileInfo(const BlockStreamProfileInfo & info); void sendTotals(const Block & totals); void sendExtremes(const Block & extremes); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index c852151f27d..ce60856505e 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB @@ -205,6 +206,7 @@ namespace virtual void insertStringColumn(const ColumnPtr & column, const String & name) = 0; virtual void insertUInt64Column(const ColumnPtr & column, const String & name) = 0; + virtual void insertUUIDColumn(const ColumnPtr & column, const String & name) = 0; }; } @@ -241,6 +243,16 @@ static void injectVirtualColumnsImpl(size_t rows, VirtualColumnsInserter & inser inserter.insertUInt64Column(column, virtual_column_name); } + else if (virtual_column_name == "_part_uuid") + { + ColumnPtr column; + if (rows) + column = DataTypeUUID().createColumnConst(rows, task->data_part->uuid)->convertToFullColumnIfConst(); + else + column = DataTypeUUID().createColumn(); + + inserter.insertUUIDColumn(column, virtual_column_name); + } else if (virtual_column_name == "_partition_id") { ColumnPtr column; @@ -271,6 +283,11 @@ namespace block.insert({column, std::make_shared(), name}); } + void insertUUIDColumn(const ColumnPtr & column, const String & name) final + { + block.insert({column, std::make_shared(), name}); + } + Block & block; }; @@ -288,6 +305,10 @@ namespace columns.push_back(column); } + void insertUUIDColumn(const ColumnPtr & column, const String &) final + { + columns.push_back(column); + } Columns & columns; }; } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 9ed751cbc8e..56e6033d18e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -3949,6 +3950,7 @@ NamesAndTypesList MergeTreeData::getVirtuals() const return NamesAndTypesList{ NameAndTypePair("_part", 
std::make_shared()), NameAndTypePair("_part_index", std::make_shared()), + NameAndTypePair("_part_uuid", std::make_shared()), NameAndTypePair("_partition_id", std::make_shared()), NameAndTypePair("_sample_factor", std::make_shared()), }; diff --git a/src/Storages/MergeTree/MergeTreeDataPartUUID.cpp b/src/Storages/MergeTree/MergeTreeDataPartUUID.cpp new file mode 100644 index 00000000000..17d19855798 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartUUID.cpp @@ -0,0 +1,38 @@ +#include + +namespace DB +{ + +std::vector PartUUIDs::add(const std::vector & new_uuids) +{ + std::lock_guard lock(mutex); + std::vector intersection; + + /// First check any presence of uuids in a uuids, return duplicates back if any + for (const auto & uuid : new_uuids) + { + if (uuids.find(uuid) != uuids.end()) + intersection.emplace_back(uuid); + } + + if (intersection.empty()) + { + for (const auto & uuid : new_uuids) + uuids.emplace(uuid); + } + return intersection; +} + +std::vector PartUUIDs::get() const +{ + std::lock_guard lock(mutex); + return std::vector(uuids.begin(), uuids.end()); +} + +bool PartUUIDs::has(const UUID & uuid) const +{ + std::lock_guard lock(mutex); + return uuids.find(uuid) != uuids.end(); +} + +} diff --git a/src/Storages/MergeTree/MergeTreeDataPartUUID.h b/src/Storages/MergeTree/MergeTreeDataPartUUID.h new file mode 100644 index 00000000000..ee3a9ee2791 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartUUID.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/** PartUUIDs is a uuid set to control query deduplication. + * The object is used in query context in both direction: + * Server->Client to send all parts' UUIDs that have been read during the query + * Client->Server to ignored specified parts from being processed. + * + * Current implementation assumes a user setting allow_experimental_query_deduplication=1 is set. 
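+ *
+ * add() inserts the given UUIDs only when none of them are present yet;
+ * otherwise it inserts nothing and returns the duplicates, so the caller can
+ * retry with those parts excluded.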
+ */ +struct PartUUIDs +{ +public: + /// Add new UUIDs if not duplicates found otherwise return duplicated UUIDs + std::vector add(const std::vector & uuids); + /// Get accumulated UUIDs + std::vector get() const; + bool has(const UUID & uuid) const; + +private: + mutable std::mutex mutex; + std::unordered_set uuids; +}; + +using PartUUIDsPtr = std::shared_ptr; + +} diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 457c9c04aa9..740288e3b46 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -35,8 +36,10 @@ #include #include +#include #include #include +#include #include #include @@ -61,6 +64,7 @@ namespace ErrorCodes extern const int TOO_MANY_ROWS; extern const int CANNOT_PARSE_TEXT; extern const int TOO_MANY_PARTITIONS; + extern const int DUPLICATED_PART_UUIDS; } @@ -71,14 +75,27 @@ MergeTreeDataSelectExecutor::MergeTreeDataSelectExecutor(const MergeTreeData & d /// Construct a block consisting only of possible values of virtual columns -static Block getBlockWithPartColumn(const MergeTreeData::DataPartsVector & parts) +static Block getBlockWithVirtualPartColumns(const MergeTreeData::DataPartsVector & parts, bool with_uuid) { - auto column = ColumnString::create(); + auto part_column = ColumnString::create(); + auto part_uuid_column = ColumnUUID::create(); for (const auto & part : parts) - column->insert(part->name); + { + part_column->insert(part->name); + if (with_uuid) + part_uuid_column->insert(part->uuid); + } - return Block{ColumnWithTypeAndName(std::move(column), std::make_shared(), "_part")}; + if (with_uuid) + { + return Block(std::initializer_list{ + ColumnWithTypeAndName(std::move(part_column), std::make_shared(), "_part"), + ColumnWithTypeAndName(std::move(part_uuid_column), std::make_shared(), "_part_uuid"), + }); + } + + return Block{ColumnWithTypeAndName(std::move(part_column), std::make_shared(), "_part")}; } @@ -162,6 +179,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( Names real_column_names; bool part_column_queried = false; + bool part_uuid_column_queried = false; bool sample_factor_column_queried = false; Float64 used_sample_factor = 1; @@ -181,6 +199,11 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( { virt_column_names.push_back(name); } + else if (name == "_part_uuid") + { + part_uuid_column_queried = true; + virt_column_names.push_back(name); + } else if (name == "_sample_factor") { sample_factor_column_queried = true; @@ -198,9 +221,9 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( if (real_column_names.empty()) real_column_names.push_back(ExpressionActions::getSmallestColumn(available_real_columns)); - /// If `_part` virtual column is requested, we try to use it as an index. - Block virtual_columns_block = getBlockWithPartColumn(parts); - if (part_column_queried) + /// If `_part` or `_part_uuid` virtual columns are requested, we try to filter out data by them. 
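The PartUUIDs helper defined above is essentially a mutex-guarded set whose add() reports collisions to the caller instead of silently deduplicating. Below is a minimal standalone sketch of that pattern, using a 64-bit integer as a stand-in for the UUID type; the names are illustrative, not the real ClickHouse API.

```cpp
#include <cstdint>
#include <mutex>
#include <unordered_set>
#include <vector>

// Illustrative stand-in for the UUID type.
using Uuid = std::uint64_t;

// A set that either accepts a whole batch of UUIDs or reports the duplicates
// back to the caller, mirroring the all-or-nothing behaviour of PartUUIDs::add().
struct UuidSet
{
    // Returns the subset of new_uuids that is already present; the batch is
    // inserted only when there were no collisions at all.
    std::vector<Uuid> add(const std::vector<Uuid> & new_uuids)
    {
        std::lock_guard<std::mutex> lock(mutex);
        std::vector<Uuid> duplicates;
        for (const auto & u : new_uuids)
            if (uuids.count(u))
                duplicates.push_back(u);

        if (duplicates.empty())
            uuids.insert(new_uuids.begin(), new_uuids.end());
        return duplicates;
    }

    bool has(Uuid u) const
    {
        std::lock_guard<std::mutex> lock(mutex);
        return uuids.count(u) != 0;
    }

private:
    mutable std::mutex mutex;
    std::unordered_set<Uuid> uuids;
};
```

The all-or-nothing insert is what lets the caller take the reported duplicates, put them on the ignore list, and retry the part selection cleanly, which is how the duplicates are used in the selection code further below.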
+ Block virtual_columns_block = getBlockWithVirtualPartColumns(parts, part_uuid_column_queried); + if (part_column_queried || part_uuid_column_queried) VirtualColumnUtils::filterBlockWithQuery(query_info.query, virtual_columns_block, context); auto part_values = VirtualColumnUtils::extractSingleValueFromBlock(virtual_columns_block, "_part"); @@ -246,36 +269,88 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( /// Select the parts in which there can be data that satisfy `minmax_idx_condition` and that match the condition on `_part`, /// as well as `max_block_number_to_read`. + /// Skip parts uuids if any to the query context, or skip parts which uuids marked as excluded. { - auto prev_parts = parts; - parts.clear(); + Context & query_context + = context.hasQueryContext() ? const_cast(context).getQueryContext() : const_cast(context); - for (const auto & part : prev_parts) + /// process_parts prepare parts that have to be read for the query, + /// returns false if duplicated parts' UUID have been met + auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool { - if (part_values.find(part->name) == part_values.end()) - continue; + auto ignored_part_uuids = query_context.getIgnoredPartUUIDs(); + std::unordered_set temp_part_uuids; - if (part->isEmpty()) - continue; + auto prev_parts = selected_parts; + selected_parts.clear(); - if (minmax_idx_condition && !minmax_idx_condition->checkInHyperrectangle( - part->minmax_idx.hyperrectangle, data.minmax_idx_column_types).can_be_true) - continue; - - if (partition_pruner) + for (const auto & part : prev_parts) { - if (partition_pruner->canBePruned(part)) + if (part_values.find(part->name) == part_values.end()) continue; + + if (part->isEmpty()) + continue; + + if (minmax_idx_condition + && !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, data.minmax_idx_column_types) + .can_be_true) + continue; + + if (partition_pruner) + { + if (partition_pruner->canBePruned(part)) + continue; + } + + if (max_block_numbers_to_read) + { + auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); + if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) + continue; + } + + /// populate UUIDs and exclude ignored parts if enabled + if (query_context.getSettingsRef().allow_experimental_query_deduplication && part->uuid != UUIDHelpers::Nil) + { + /// Skip the part if its uuid is meant to be excluded + if (ignored_part_uuids->has(part->uuid)) + continue; + + auto result = temp_part_uuids.insert(part->uuid); + if (!result.second) + throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR); + } + + selected_parts.push_back(part); } - if (max_block_numbers_to_read) + if (!temp_part_uuids.empty()) { - auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); - if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) - continue; + auto duplicates = query_context.getPartUUIDs()->add(std::vector{temp_part_uuids.begin(), temp_part_uuids.end()}); + if (!duplicates.empty()) + { + /// on a local replica with prefer_localhost_replica=1 if any duplicates appeared during the first pass, + /// adding them to the exclusion, so they will be skipped on second pass + query_context.getIgnoredPartUUIDs()->add(duplicates); + return false; + } } - parts.push_back(part); + return true; + }; + + /// Process parts that have to be read for a query. 
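The code that follows runs the selection at most twice: a first pass that can bail out when the query-wide UUID set reports collisions, and a single retry after those UUIDs have been added to the ignore list. A simplified sketch of that control flow is below, with a generic callback standing in for the select_parts lambda; none of these names are the real MergeTree interfaces.

```cpp
#include <functional>
#include <stdexcept>
#include <vector>

// select_once() fills `out` with the parts chosen for reading and returns
// false when it detected part UUIDs already claimed elsewhere (after
// registering them as ignored), which is the signal to retry.
template <typename Part>
std::vector<Part> selectPartsWithOneRetry(
    const std::vector<Part> & all_parts,
    const std::function<bool(const std::vector<Part> &, std::vector<Part> &)> & select_once)
{
    std::vector<Part> selected;

    // First pass: may bail out if duplicated part UUIDs were found.
    if (select_once(all_parts, selected))
        return selected;

    // Second and last pass: the duplicates are now excluded, so this is
    // expected to succeed; if it still reports duplicates, give up, which
    // corresponds to the DUPLICATED_PART_UUIDS exception in the real code.
    selected.clear();
    if (!select_once(all_parts, selected))
        throw std::runtime_error("duplicate part UUIDs even after retrying");

    return selected;
}
```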
+ auto needs_retry = !select_parts(parts); + /// If any duplicated part UUIDs met during the first step, try to ignore them in second pass + if (needs_retry) + { + if (log) + LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them"); + + /// Second attempt didn't help, throw an exception + if (!select_parts(parts)) + throw Exception("Found duplicate UUIDs while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); } } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 5227cd8a33e..570aeef820d 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -345,6 +346,7 @@ NamesAndTypesList StorageDistributed::getVirtuals() const NameAndTypePair("_table", std::make_shared()), NameAndTypePair("_part", std::make_shared()), NameAndTypePair("_part_index", std::make_shared()), + NameAndTypePair("_part_uuid", std::make_shared()), NameAndTypePair("_partition_id", std::make_shared()), NameAndTypePair("_sample_factor", std::make_shared()), NameAndTypePair("_shard_num", std::make_shared()), diff --git a/src/Storages/ya.make b/src/Storages/ya.make index 69e319cbad5..dbf37e58695 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -48,6 +48,7 @@ SRCS( MergeTree/MergeTreeDataPartInMemory.cpp MergeTree/MergeTreeDataPartTTLInfo.cpp MergeTree/MergeTreeDataPartType.cpp + MergeTree/MergeTreeDataPartUUID.cpp MergeTree/MergeTreeDataPartWide.cpp MergeTree/MergeTreeDataPartWriterCompact.cpp MergeTree/MergeTreeDataPartWriterInMemory.cpp diff --git a/tests/integration/test_query_deduplication/__init__.py b/tests/integration/test_query_deduplication/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_query_deduplication/configs/deduplication_settings.xml b/tests/integration/test_query_deduplication/configs/deduplication_settings.xml new file mode 100644 index 00000000000..8369c916848 --- /dev/null +++ b/tests/integration/test_query_deduplication/configs/deduplication_settings.xml @@ -0,0 +1,5 @@ + + + 1 + + diff --git a/tests/integration/test_query_deduplication/configs/remote_servers.xml b/tests/integration/test_query_deduplication/configs/remote_servers.xml new file mode 100644 index 00000000000..f12558ca529 --- /dev/null +++ b/tests/integration/test_query_deduplication/configs/remote_servers.xml @@ -0,0 +1,24 @@ + + + + + + node1 + 9000 + + + + + node2 + 9000 + + + + + node3 + 9000 + + + + + diff --git a/tests/integration/test_query_deduplication/test.py b/tests/integration/test_query_deduplication/test.py new file mode 100644 index 00000000000..8d935b98579 --- /dev/null +++ b/tests/integration/test_query_deduplication/test.py @@ -0,0 +1,165 @@ +import uuid + +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + +DUPLICATED_UUID = uuid.uuid4() + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance( + 'node1', + main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml']) + +node2 = cluster.add_instance( + 'node2', + main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml']) + +node3 = cluster.add_instance( + 'node3', + main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml']) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def prepare_node(node, parts_uuid=None): + 
node.query(""" + CREATE TABLE t(_prefix UInt8 DEFAULT 0, key UInt64, value UInt64) + ENGINE MergeTree() + ORDER BY tuple() + PARTITION BY _prefix + SETTINGS index_granularity = 1 + """) + + node.query(""" + CREATE TABLE d AS t ENGINE=Distributed(test_cluster, default, t) + """) + + # Stop merges while populating test data + node.query("SYSTEM STOP MERGES") + + # Create 5 parts + for i in range(1, 6): + node.query("INSERT INTO t VALUES ({}, {}, {})".format(i, i, i)) + + node.query("DETACH TABLE t") + + if parts_uuid: + for part, part_uuid in parts_uuid: + script = """ + echo -n '{}' > /var/lib/clickhouse/data/default/t/{}/uuid.txt + """.format(part_uuid, part) + node.exec_in_container(["bash", "-c", script]) + + # Attach table back + node.query("ATTACH TABLE t") + + # NOTE: + # due to the absence of the ability to lock a part, we need to operate on parts with merges prevented + # node.query("SYSTEM START MERGES") + # node.query("OPTIMIZE TABLE t FINAL") + + print(node.name) + print(node.query("SELECT name, uuid, partition FROM system.parts WHERE table = 't' AND active ORDER BY name")) + + assert '5' == node.query("SELECT count() FROM system.parts WHERE table = 't' AND active").strip() + if parts_uuid: + for part, part_uuid in parts_uuid: + assert '1' == node.query( + "SELECT count() FROM system.parts WHERE table = 't' AND uuid = '{}' AND active".format( + part_uuid)).strip() + + +@pytest.fixture(scope="module") +def prepared_cluster(started_cluster): + print("duplicated UUID: {}".format(DUPLICATED_UUID)) + prepare_node(node1, parts_uuid=[("3_3_3_0", DUPLICATED_UUID)]) + prepare_node(node2, parts_uuid=[("3_3_3_0", DUPLICATED_UUID)]) + prepare_node(node3) + + +def test_virtual_column(prepared_cluster): + # Part containing `key=3` has the same fingerprint on both nodes, + # we expect it to be included only once in the end result. + # The select query uses the virtual column _part_uuid to filter out the part in one shard + expected = """ + 1 2 + 2 2 + 3 1 + 4 2 + 5 2 + """ + assert TSV(expected) == TSV(node1.query(""" + SELECT + key, + count() AS c + FROM d + WHERE ((_shard_num = 1) AND (_part_uuid != '{}')) OR (_shard_num = 2) + GROUP BY key + ORDER BY + key ASC + """.format(DUPLICATED_UUID))) + + +def test_with_deduplication(prepared_cluster): + # Part containing `key=3` has the same fingerprint on both nodes, + # we expect it to be included only once in the end result + expected = """ +1 3 +2 3 +3 2 +4 3 +5 3 +""" + assert TSV(expected) == TSV(node1.query( + "SET allow_experimental_query_deduplication=1; SELECT key, count() c FROM d GROUP BY key ORDER BY key")) + + +def test_no_merge_with_deduplication(prepared_cluster): + # Part containing `key=3` has the same fingerprint on both nodes, + # we expect it to be included only once in the end result.
+ # even with distributed_group_by_no_merge=1 the duplicated part should be excluded from the final result + expected = """ +1 1 +2 1 +3 1 +4 1 +5 1 +1 1 +2 1 +3 1 +4 1 +5 1 +1 1 +2 1 +4 1 +5 1 +""" + assert TSV(expected) == TSV(node1.query("SELECT key, count() c FROM d GROUP BY key ORDER BY key", settings={ + "allow_experimental_query_deduplication": 1, + "distributed_group_by_no_merge": 1, + })) + + +def test_without_deduplication(prepared_cluster): + # Part containing `key=3` has the same fingerprint on both nodes, + # but allow_experimental_query_deduplication is disabled, + # so it will not be excluded + expected = """ +1 3 +2 3 +3 3 +4 3 +5 3 +""" + assert TSV(expected) == TSV(node1.query( + "SET allow_experimental_query_deduplication=0; SELECT key, count() c FROM d GROUP BY key ORDER BY key")) From d05c6446b9187411f29cd9cf052e99152502eda8 Mon Sep 17 00:00:00 2001 From: Aleksei Semiglazov Date: Tue, 2 Feb 2021 23:21:07 +0000 Subject: [PATCH 022/122] Send cancel packet and cancel read_context before retrying the query --- src/DataStreams/RemoteQueryExecutor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index ce7db264eef..27b3de66497 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -244,7 +244,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr return Block(); } - if (!read_context) + if (!read_context || resent_query) { std::lock_guard lock(was_cancelled_mutex); if (was_cancelled) @@ -270,7 +270,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr if (got_duplicated_part_uuids) { /// Cancel previous query and disconnect before retry. - cancel(); + cancel(&read_context); multiplexed_connections->disconnect(); /// Only resend once, otherwise throw an exception @@ -523,7 +523,7 @@ bool RemoteQueryExecutor::isQueryPending() const bool RemoteQueryExecutor::hasThrownException() const { - return got_exception_from_replica || got_unknown_packet_from_replica || got_duplicated_part_uuids; + return got_exception_from_replica || got_unknown_packet_from_replica; } } From 7e945bab03bef9260bfc4776bdffe44fb7f8c99f Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 08:53:21 +0300 Subject: [PATCH 023/122] fix the calculation for moving frame start --- src/Common/StackTrace.h | 10 +- src/Processors/Transforms/WindowTransform.cpp | 164 ++++++++++-------- src/Processors/Transforms/WindowTransform.h | 9 +- tests/performance/window_functions.xml | 26 ++- 4 files changed, 130 insertions(+), 79 deletions(-) diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index 3ae4b964838..b2e14a01f03 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -34,7 +34,15 @@ public: std::optional file; std::optional line; }; - static constexpr size_t capacity = 32; + + static constexpr size_t capacity = +#ifndef NDEBUG + /* The stacks are normally larger in debug version due to less inlining. */ + 64 +#else + 32 +#endif + ; using FramePointers = std::array; using Frames = std::array; diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 23acc85aef0..cc526eba8c4 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -348,49 +348,6 @@ void WindowTransform::advanceFrameStart() // further. 
assert(frame_started); } - - // We're very dumb and have to reinitialize aggregate functions if the frame - // start changed. No point in doing it if we don't yet know where the frame - // starts. - if (!frame_started) - { - return; - } - - // frame_end value might not be valid yet, but we know that it is greater or - // equal than frame_start. If it's less than the new frame_start, we have to - // skip rows between frame_end and frame_start, because they are not in the - // frame and must not contribute to the value of aggregate functions. - if (frame_end < frame_start) - { - frame_end = frame_start; - } - - for (auto & ws : workspaces) - { - const auto & f = ws.window_function; - const auto * a = f.aggregate_function.get(); - auto * buf = ws.aggregate_function_state.data(); - - a->destroy(buf); - a->create(buf); - - for (auto row = frame_start; row < frame_end; advanceRowNumber(row)) - { - if (row.block != ws.cached_block_number) - { - ws.argument_columns.clear(); - for (const auto i : ws.argument_column_indices) - { - ws.argument_columns.push_back(inputAt(row)[i].get()); - } - ws.cached_block_number = row.block; - } - - a->add(buf, ws.argument_columns.data(), row.row, arena.get()); -// fmt::print(stderr, "(1) add row {}\n", row.row); - } - } } bool WindowTransform::arePeers(const RowNumber & x, const RowNumber & y) const @@ -516,7 +473,6 @@ void WindowTransform::advanceFrameEnd() switch (window_description.frame.end_type) { case WindowFrame::BoundaryType::Current: - // The only frame end we have for now is CURRENT ROW. advanceFrameEndCurrentRow(); break; case WindowFrame::BoundaryType::Unbounded: @@ -536,45 +492,81 @@ void WindowTransform::advanceFrameEnd() { return; } +} - // Add the rows over which we advanced the frame to the aggregate function - // states. We could have advanced over at most the entire last block. - uint64_t rows_end = frame_end.row; - if (frame_end.row == 0) +// Update the aggregation states after the frame has changed. +void WindowTransform::updateAggregationState() +{ +// fmt::print(stderr, "update agg states [{}, {}) -> [{}, {})\n", +// prev_frame_start, prev_frame_end, frame_start, frame_end); + + // Assert that the frame boundaries are known, have proper order wrt each + // other, and have not gone back wrt the previous frame. + assert(frame_started); + assert(frame_ended); + assert(frame_start <= frame_end); + assert(prev_frame_start <= prev_frame_end); + assert(prev_frame_start <= frame_start); + assert(prev_frame_end <= frame_end); + + // We might have to reset aggregation state and/or add some rows to it. + // Figure out what to do. + bool reset_aggregation = false; + RowNumber rows_to_add_start; + RowNumber rows_to_add_end; + if (frame_start == prev_frame_start) { - assert(frame_end == blocksEnd()); - rows_end = blockRowsNumber(frame_end_before); + // The frame start didn't change, add the tail rows. + reset_aggregation = false; + rows_to_add_start = prev_frame_end; + rows_to_add_end = frame_end; } else { - assert(frame_end_before.block == frame_end.block); + // The frame start changed, reset the state and aggregate over the + // entire frame. This can be made per-function after we learn to + // subtract rows from some types of aggregation states, but for now we + // always have to reset when the frame start changes. + reset_aggregation = true; + rows_to_add_start = frame_start; + rows_to_add_end = frame_end; } - // Equality would mean "no data to process", for which we checked above. 
- assert(frame_end_before.row < rows_end); for (auto & ws : workspaces) { - if (frame_end_before.block != ws.cached_block_number) - { - const auto & block - = blocks[frame_end_before.block - first_block_number]; - ws.argument_columns.clear(); - for (const auto i : ws.argument_column_indices) - { - ws.argument_columns.push_back(block.input_columns[i].get()); - } - ws.cached_block_number = frame_end_before.block; - } - const auto * a = ws.window_function.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); - auto * columns = ws.argument_columns.data(); - for (auto row = frame_end_before.row; row < rows_end; ++row) + + if (reset_aggregation) { +// fmt::print(stderr, "(2) reset aggregation\n"); + a->destroy(buf); + a->create(buf); + } + + for (auto row = rows_to_add_start; row < rows_to_add_end; + advanceRowNumber(row)) + { + if (row.block != ws.cached_block_number) + { + const auto & block + = blocks[row.block - first_block_number]; + ws.argument_columns.clear(); + for (const auto i : ws.argument_column_indices) + { + ws.argument_columns.push_back(block.input_columns[i].get()); + } + ws.cached_block_number = row.block; + } + // fmt::print(stderr, "(2) add row {}\n", row); - a->add(buf, columns, row, arena.get()); + auto * columns = ws.argument_columns.data(); + a->add(buf, columns, row.row, arena.get()); } } + + prev_frame_start = frame_start; + prev_frame_end = frame_end; } void WindowTransform::writeOutCurrentRow() @@ -646,8 +638,11 @@ void WindowTransform::appendChunk(Chunk & chunk) // which is precisely the definition of `partition_end`. while (current_row < partition_end) { - // Advance the frame start, updating the state of the aggregate - // functions. +// fmt::print(stderr, "(1) row {} frame [{}, {}) {}, {}\n", +// current_row, frame_start, frame_end, +// frame_started, frame_ended); + + // Advance the frame start. advanceFrameStart(); if (!frame_started) @@ -655,15 +650,19 @@ void WindowTransform::appendChunk(Chunk & chunk) // Wait for more input data to find the start of frame. assert(!input_is_finished); assert(!partition_ended); + return; } - // Advance the frame end, updating the state of the aggregate - // functions. - advanceFrameEnd(); + // frame_end must be greater or equal than frame_start, so if the + // frame_start is already past the current frame_end, we can start + // from it to save us some work. + if (frame_end < frame_start) + { + frame_end = frame_start; + } -// fmt::print(stderr, "row {} frame [{}, {}) {}, {}\n", -// current_row, frame_start, frame_end, -// frame_started, frame_ended); + // Advance the frame end. + advanceFrameEnd(); if (!frame_ended) { @@ -673,6 +672,10 @@ void WindowTransform::appendChunk(Chunk & chunk) return; } +// fmt::print(stderr, "(2) row {} frame [{}, {}) {}, {}\n", +// current_row, frame_start, frame_end, +// frame_started, frame_ended); + // The frame can be empty sometimes, e.g. the boundaries coincide // or the start is after the partition end. But hopefully start is // not after end. @@ -680,6 +683,13 @@ void WindowTransform::appendChunk(Chunk & chunk) assert(frame_ended); assert(frame_start <= frame_end); + // Now that we know the new frame boundaries, update the aggregation + // states. Theoretically we could do this simultaneously with moving + // the frame boundaries, but it would require some care not to + // perform unnecessary work while we are still looking for the frame + // start, so do it the simple way for now. + updateAggregationState(); + // Write out the aggregation results. 
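updateAggregationState() above boils down to one rule: if the frame start did not move, only the rows between the previous and the new frame end are fed into the accumulated state; if it did move, the state is reset and the whole new frame is re-aggregated. Below is a small numeric sketch of the same bookkeeping, with a plain running sum standing in for an aggregate function state (illustrative only, not the transform's actual types).

```cpp
#include <cstddef>
#include <vector>

// Running-sum stand-in for an aggregate state that supports add() but not
// subtraction of rows -- which is exactly why a moving frame start forces
// a full reset in WindowTransform::updateAggregationState().
struct FrameSum
{
    double value = 0;
    std::size_t prev_start = 0;
    std::size_t prev_end = 0;

    // Bring the state from [prev_start, prev_end) to [start, end).
    // As in the transform, the new frame never moves backwards.
    void update(const std::vector<double> & rows, std::size_t start, std::size_t end)
    {
        std::size_t add_from;
        if (start == prev_start)
        {
            // Frame start unchanged: only the tail rows are new.
            add_from = prev_end;
        }
        else
        {
            // Frame start moved: reset and re-aggregate the whole frame.
            value = 0;
            add_from = start;
        }

        for (std::size_t i = add_from; i < end; ++i)
            value += rows[i];

        prev_start = start;
        prev_end = end;
    }
};
```

For a frame such as ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW the start never moves, so each output row costs one addition; a moving frame start falls back to re-aggregating the frame, which is the trade-off the comment about learning to subtract rows from aggregation states refers to.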
writeOutCurrentRow(); @@ -716,6 +726,8 @@ void WindowTransform::appendChunk(Chunk & chunk) // for now. frame_start = partition_start; frame_end = partition_start; + prev_frame_start = partition_start; + prev_frame_end = partition_start; assert(current_row == partition_start); // fmt::print(stderr, "reinitialize agg data at start of {}\n", diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index afc44b2f706..5ad1132bfab 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -113,6 +113,7 @@ private: void advanceFrameEndCurrentRow(); void advanceFrameEndUnbounded(); bool arePeers(const RowNumber & x, const RowNumber & y) const; + void updateAggregationState(); void writeOutCurrentRow(); Columns & inputAt(const RowNumber & x) @@ -254,7 +255,7 @@ public: RowNumber partition_end; bool partition_ended = false; - // This is the row for which we are computing the window functions now. + // The row for which we are now computing the window functions. RowNumber current_row; // The frame is [frame_start, frame_end) if frame_ended && frame_started, @@ -270,6 +271,12 @@ public: RowNumber frame_end; bool frame_ended = false; bool frame_started = false; + + // The previous frame boundaries that correspond to the current state of the + // aggregate function. We use them to determine how to update the aggregation + // state after we find the new frame. + RowNumber prev_frame_start; + RowNumber prev_frame_end; }; } diff --git a/tests/performance/window_functions.xml b/tests/performance/window_functions.xml index f42345d0696..93983e9b1bf 100644 --- a/tests/performance/window_functions.xml +++ b/tests/performance/window_functions.xml @@ -25,7 +25,31 @@ select * from ( select CounterID, UserID, count(*) user_hits, - count() over (partition by CounterID order by user_hits desc) + count() + over (partition by CounterID order by user_hits desc + rows unbounded preceding) + user_rank + from hits_100m_single + where CounterID < 10000 + group by CounterID, UserID + ) + where user_rank <= 10 + format Null + ]]> + + + Date: Wed, 3 Feb 2021 09:42:54 +0300 Subject: [PATCH 024/122] add frame fuzzing and one query that fails under msan --- programs/client/QueryFuzzer.cpp | 44 +++++++++++++++++++ programs/client/QueryFuzzer.h | 2 + .../01591_window_functions.reference | 2 + .../0_stateless/01591_window_functions.sql | 4 ++ 4 files changed, 52 insertions(+) diff --git a/programs/client/QueryFuzzer.cpp b/programs/client/QueryFuzzer.cpp index 3892e8e5732..e07d06e6080 100644 --- a/programs/client/QueryFuzzer.cpp +++ b/programs/client/QueryFuzzer.cpp @@ -325,6 +325,49 @@ void QueryFuzzer::fuzzColumnLikeExpressionList(IAST * ast) // the generic recursion into IAST.children. } +void QueryFuzzer::fuzzWindowFrame(WindowFrame & frame) +{ + switch (fuzz_rand() % 40) + { + case 0: + { + const auto r = fuzz_rand() % 3; + frame.type = r == 0 ? WindowFrame::FrameType::Rows + : r == 1 ? WindowFrame::FrameType::Range + : WindowFrame::FrameType::Groups; + break; + } + case 1: + { + const auto r = fuzz_rand() % 3; + frame.begin_type = r == 0 ? WindowFrame::BoundaryType::Unbounded + : r == 1 ? WindowFrame::BoundaryType::Current + : WindowFrame::BoundaryType::Offset; + break; + } + case 2: + { + const auto r = fuzz_rand() % 3; + frame.end_type = r == 0 ? WindowFrame::BoundaryType::Unbounded + : r == 1 ? 
WindowFrame::BoundaryType::Current + : WindowFrame::BoundaryType::Offset; + break; + } + case 3: + { + frame.begin_offset = getRandomField(0).get(); + break; + } + case 4: + { + frame.end_offset = getRandomField(0).get(); + break; + } + default: + break; + } +} + void QueryFuzzer::fuzz(ASTs & asts) { for (auto & ast : asts) @@ -409,6 +452,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast) auto & def = fn->window_definition->as(); fuzzColumnLikeExpressionList(def.partition_by.get()); fuzzOrderByList(def.order_by.get()); + fuzzWindowFrame(def.frame); } fuzz(fn->children); diff --git a/programs/client/QueryFuzzer.h b/programs/client/QueryFuzzer.h index e9d3f150283..38714205967 100644 --- a/programs/client/QueryFuzzer.h +++ b/programs/client/QueryFuzzer.h @@ -14,6 +14,7 @@ namespace DB class ASTExpressionList; class ASTOrderByElement; +struct WindowFrame; /* * This is an AST-based query fuzzer that makes random modifications to query @@ -65,6 +66,7 @@ struct QueryFuzzer void fuzzOrderByElement(ASTOrderByElement * elem); void fuzzOrderByList(IAST * ast); void fuzzColumnLikeExpressionList(IAST * ast); + void fuzzWindowFrame(WindowFrame & frame); void fuzz(ASTs & asts); void fuzz(ASTPtr & ast); void collectFuzzInfoMain(const ASTPtr ast); diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 7e286f753e5..bd1a954ddc4 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -558,3 +558,5 @@ settings max_block_size = 2; 28 5 3 2 1 29 5 2 1 0 30 6 1 1 0 +-- seen a use-after-free under MSan in this query once +SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 3b4bdd03724..3a6d2f3d18a 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -174,3 +174,7 @@ select number, p, from (select number, intDiv(number, 5) p from numbers(31)) order by p, number settings max_block_size = 2; + +-- seen a use-after-free under MSan in this query once +SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; + From dad4f82e0e53ab687fa2583ae5b4c75651de05b1 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 12:08:46 +0300 Subject: [PATCH 025/122] fix invalid iterator addition --- src/Processors/Transforms/WindowTransform.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index cc526eba8c4..3dcd0a91bca 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -910,7 +910,7 @@ void WindowTransform::work() // first_used_block); blocks.erase(blocks.begin(), - blocks.begin() + first_used_block - first_block_number); + blocks.begin() + (first_used_block - first_block_number)); first_block_number = first_used_block; assert(next_output_block_number >= first_block_number); From b6094859c7b643514c82f2c81a86400a9b79108f Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 3 Feb 2021 14:26:24 
+0300 Subject: [PATCH 026/122] Throw exception during restore DiskS3 if object doesn't have path in metadata. --- src/Disks/S3/DiskS3.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index f63ca4117db..f861cff2424 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -1149,10 +1149,7 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so /// Restore file if object has 'path' in metadata. auto path_entry = object_metadata.find("path"); if (path_entry == object_metadata.end()) - { - LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have 'path' in metadata", key); - continue; - } + throw Exception("Failed to restore key " + key + " because it doesn't have 'path' in metadata", ErrorCodes::S3_ERROR); const auto & path = path_entry->second; From 62e96c138e59449764b2e2572790cdd910ec5d89 Mon Sep 17 00:00:00 2001 From: Pavel Kovalenko Date: Wed, 3 Feb 2021 15:01:43 +0300 Subject: [PATCH 027/122] Remove unused error code in DiskS3 --- src/Disks/S3/DiskS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index f861cff2424..42c022a3714 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -39,7 +39,6 @@ namespace ErrorCodes extern const int CANNOT_SEEK_THROUGH_FILE; extern const int UNKNOWN_FORMAT; extern const int INCORRECT_DISK_INDEX; - extern const int NOT_IMPLEMENTED; extern const int BAD_ARGUMENTS; extern const int PATH_ACCESS_DENIED; extern const int CANNOT_DELETE_DIRECTORY; From b0f400ca1b63c30ba5c2bc57f95aec241fb07056 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Wed, 3 Feb 2021 15:31:45 +0300 Subject: [PATCH 028/122] Reset internal buffer position on next() --- src/IO/BufferBase.h | 1 + src/IO/ConcatReadBuffer.h | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/IO/BufferBase.h b/src/IO/BufferBase.h index c22dcbecf7b..198441d8bc1 100644 --- a/src/IO/BufferBase.h +++ b/src/IO/BufferBase.h @@ -40,6 +40,7 @@ public: inline Position end() const { return end_pos; } inline size_t size() const { return size_t(end_pos - begin_pos); } inline void resize(size_t size) { end_pos = begin_pos + size; } + inline bool empty() const { return size() == 0; } inline void swap(Buffer & other) { diff --git a/src/IO/ConcatReadBuffer.h b/src/IO/ConcatReadBuffer.h index 1df99429e93..c416b0fd892 100644 --- a/src/IO/ConcatReadBuffer.h +++ b/src/IO/ConcatReadBuffer.h @@ -25,11 +25,16 @@ protected: return false; /// First reading - if (working_buffer.size() == 0 && (*current)->hasPendingData()) + if (working_buffer.empty()) { - working_buffer = Buffer((*current)->position(), (*current)->buffer().end()); - return true; + if ((*current)->hasPendingData()) + { + working_buffer = Buffer((*current)->position(), (*current)->buffer().end()); + return true; + } } + else + (*current)->position() = position(); if (!(*current)->next()) { @@ -51,14 +56,12 @@ protected: } public: - ConcatReadBuffer(const ReadBuffers & buffers_) : ReadBuffer(nullptr, 0), buffers(buffers_), current(buffers.begin()) {} - - ConcatReadBuffer(ReadBuffer & buf1, ReadBuffer & buf2) : ReadBuffer(nullptr, 0) + explicit ConcatReadBuffer(const ReadBuffers & buffers_) : ReadBuffer(nullptr, 0), buffers(buffers_), current(buffers.begin()) { - buffers.push_back(&buf1); - buffers.push_back(&buf2); - current = buffers.begin(); + assert(!buffers.empty()); } + + ConcatReadBuffer(ReadBuffer & buf1, ReadBuffer & 
buf2) : ConcatReadBuffer({&buf1, &buf2}) {} }; } From 72b0a18503569d240bd2e41355efd83120576d46 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Wed, 3 Feb 2021 15:37:32 +0300 Subject: [PATCH 029/122] Check for unread data on next() --- src/IO/ReadBuffer.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 3d6eb6970ce..0b39a595075 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -55,6 +55,8 @@ public: */ bool next() { + assert(!hasPendingData()); + bytes += offset(); bool res = nextImpl(); if (!res) @@ -72,7 +74,7 @@ public: next(); } - virtual ~ReadBuffer() {} + virtual ~ReadBuffer() = default; /** Unlike std::istream, it returns true if all data was read @@ -192,7 +194,7 @@ private: */ virtual bool nextImpl() { return false; } - [[noreturn]] void throwReadAfterEOF() + [[noreturn]] static inline void throwReadAfterEOF() { throw Exception("Attempt to read after eof", ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF); } From f31d2206a74a7eb0afc784c0fd99367ac583ad1b Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 15:50:25 +0300 Subject: [PATCH 030/122] more fuzzing and less bugs --- programs/client/QueryFuzzer.cpp | 19 ++++++ src/Interpreters/WindowDescription.cpp | 68 +++++++++++++++++++ src/Interpreters/WindowDescription.h | 7 ++ src/Parsers/ExpressionElementParsers.cpp | 1 + src/Processors/QueryPlan/WindowStep.cpp | 1 + src/Processors/Transforms/WindowTransform.cpp | 21 ++++-- .../01591_window_functions.reference | 5 ++ .../0_stateless/01591_window_functions.sql | 2 + 8 files changed, 118 insertions(+), 6 deletions(-) diff --git a/programs/client/QueryFuzzer.cpp b/programs/client/QueryFuzzer.cpp index d569a185dba..05c20434820 100644 --- a/programs/client/QueryFuzzer.cpp +++ b/programs/client/QueryFuzzer.cpp @@ -366,6 +366,8 @@ void QueryFuzzer::fuzzWindowFrame(WindowFrame & frame) default: break; } + + frame.is_default = (frame == WindowFrame{}); } void QueryFuzzer::fuzz(ASTs & asts) @@ -465,6 +467,23 @@ void QueryFuzzer::fuzz(ASTPtr & ast) fuzz(select->children); } + /* + * The time to fuzz the settings has not yet come. + * Apparently we don't have any infractructure to validate the values of + * the settings, and the first query with max_block_size = -1 breaks + * because of overflows here and there. + *//* + * else if (auto * set = typeid_cast(ast.get())) + * { + * for (auto & c : set->changes) + * { + * if (fuzz_rand() % 50 == 0) + * { + * c.value = fuzzField(c.value); + * } + * } + * } + */ else if (auto * literal = typeid_cast(ast.get())) { // There is a caveat with fuzzing the children: many ASTs also keep the diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index bfb53ebb79f..3569df6fd17 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -33,4 +33,72 @@ std::string WindowDescription::dump() const return ss.str(); } +std::string WindowFrame::toString() const +{ + WriteBufferFromOwnString buf; + toString(buf); + return buf.str(); +} + +void WindowFrame::toString(WriteBuffer & buf) const +{ + buf << toString(type) << " BETWEEN "; + if (begin_type == BoundaryType::Current) + { + buf << "CURRENT ROW"; + } + else if (begin_type == BoundaryType::Unbounded) + { + buf << "UNBOUNDED PRECEDING"; + } + else + { + buf << abs(begin_offset); + buf << " " + << (begin_offset > 0 ? 
"FOLLOWING" : "PRECEDING"); + } + buf << " AND "; + if (end_type == BoundaryType::Current) + { + buf << "CURRENT ROW"; + } + else if (end_type == BoundaryType::Unbounded) + { + buf << "UNBOUNDED PRECEDING"; + } + else + { + buf << abs(end_offset); + buf << " " + << (end_offset > 0 ? "FOLLOWING" : "PRECEDING"); + } +} + +void WindowFrame::checkValid() const +{ + if (begin_type == BoundaryType::Unbounded + || end_type == BoundaryType::Unbounded) + { + return; + } + + if (begin_type == BoundaryType::Current + && end_type == BoundaryType::Offset + && end_offset > 0) + { + return; + } + + if (end_type == BoundaryType::Current + && begin_type == BoundaryType::Offset + && begin_offset < 0) + { + return; + } + + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Window frame '{}' is invalid", + toString()); +} + } diff --git a/src/Interpreters/WindowDescription.h b/src/Interpreters/WindowDescription.h index d34b7721a5e..447352f7a83 100644 --- a/src/Interpreters/WindowDescription.h +++ b/src/Interpreters/WindowDescription.h @@ -53,6 +53,13 @@ struct WindowFrame int64_t end_offset = 0; + // Throws BAD_ARGUMENTS exception if the frame definition is incorrect, e.g. + // the frame start comes later than the frame end. + void checkValid() const; + + std::string toString() const; + void toString(WriteBuffer & buf) const; + bool operator == (const WindowFrame & other) const { // We don't compare is_default because it's not a real property of the diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index e4a9c285223..c129c312d11 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -640,6 +640,7 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p } else if (keyword_following.ignore(pos, expected)) { + // Positive offset or UNBOUNDED FOLLOWING. } else { diff --git a/src/Processors/QueryPlan/WindowStep.cpp b/src/Processors/QueryPlan/WindowStep.cpp index 82c589b8b20..1a71ca0adc7 100644 --- a/src/Processors/QueryPlan/WindowStep.cpp +++ b/src/Processors/QueryPlan/WindowStep.cpp @@ -57,6 +57,7 @@ WindowStep::WindowStep(const DataStream & input_stream_, { // We don't remove any columns, only add, so probably we don't have to update // the output DataStream::distinct_columns. + window_description.frame.checkValid(); } void WindowStep::transformPipeline(QueryPipeline & pipeline) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 3dcd0a91bca..b7b0c72eb94 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -22,6 +22,8 @@ WindowTransform::WindowTransform(const Block & input_header_, , input_header(input_header_) , window_description(window_description_) { + window_description.frame.checkValid(); + workspaces.reserve(functions.size()); for (const auto & f : functions) { @@ -210,14 +212,21 @@ auto WindowTransform::moveRowNumberNoCheck(const RowNumber & _x, int offset) con break; } + // Move to the first row in current block. Note that the offset is + // negative. + offset += x.row; + x.row = 0; + + // Move to the last row of the previous block, if we are not at the + // first one. Offset also is incremented by one, because we pass over + // the first row of this block. 
if (x.block == first_block_number) { break; } - // offset is negative - offset += (x.row + 1); --x.block; + offset += 1; x.row = blockRowsNumber(x) - 1; } } @@ -253,10 +262,10 @@ void WindowTransform::advanceFrameStartRowsOffset() assertValid(frame_start); -// fmt::print(stderr, "frame start {} partition start {}\n", frame_start, -// partition_start); +// fmt::print(stderr, "frame start {} left {} partition start {}\n", +// frame_start, offset_left, partition_start); - if (moved_row <= partition_start) + if (frame_start <= partition_start) { // Got to the beginning of partition and can't go further back. frame_start = partition_start; @@ -269,10 +278,10 @@ void WindowTransform::advanceFrameStartRowsOffset() { // A FOLLOWING frame start ran into the end of partition. frame_started = true; + return; } assert(partition_start < frame_start); - frame_start = moved_row; frame_started = offset_left == 0; } diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index bd1a954ddc4..c128aae7796 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -558,5 +558,10 @@ settings max_block_size = 2; 28 5 3 2 1 29 5 2 1 0 30 6 1 1 0 +SELECT count(*) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM numbers(4); +1 +2 +3 +3 -- seen a use-after-free under MSan in this query once SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 3a6d2f3d18a..6c2883eae26 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -175,6 +175,8 @@ from (select number, intDiv(number, 5) p from numbers(31)) order by p, number settings max_block_size = 2; +SELECT count(*) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM numbers(4); + -- seen a use-after-free under MSan in this query once SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; From bd6d7facf1d7045798cb455c83c70c2059b0df97 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 16:33:50 +0300 Subject: [PATCH 031/122] style --- src/Interpreters/WindowDescription.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index 3569df6fd17..93a75c07161 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -6,6 +6,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + std::string WindowFunctionDescription::dump() const { WriteBufferFromOwnString ss; From 7c55ecf67db755d15714cea06adfb28f35ac3da9 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 16:41:59 +0300 Subject: [PATCH 032/122] cleanup --- src/Processors/Transforms/WindowTransform.h | 2 +- tests/queries/0_stateless/01591_window_functions.reference | 3 +++ tests/queries/0_stateless/01591_window_functions.sql | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 
5ad1132bfab..869a6fbfee2 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -246,7 +246,7 @@ public: // Boundaries of the current partition. // partition_start doesn't point to a valid block, because we want to drop - // the blocks early to save memory. We still have track it so that we can + // the blocks early to save memory. We still have to track it so that we can // cut off a PRECEDING frame at the partition start. // The `partition_end` is past-the-end, as usual. When // partition_ended = false, it still haven't ended, and partition_end is the diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index c128aae7796..e8db9ee7725 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -565,3 +565,6 @@ SELECT count(*) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM numbers(4); 3 -- seen a use-after-free under MSan in this query once SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; +-- a corner case +select count() over (); +1 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 6c2883eae26..486c8f871b7 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -180,3 +180,5 @@ SELECT count(*) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM numbers(4); -- seen a use-after-free under MSan in this query once SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; +-- a corner case +select count() over (); From 35754abb4a9ed4d5a01ef5ab0c203a95f7272329 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 17:22:37 +0300 Subject: [PATCH 033/122] CURRENT ROW frame start for RANGE frame --- src/Processors/Transforms/WindowTransform.cpp | 31 +++++++++------ src/Processors/Transforms/WindowTransform.h | 4 ++ .../01591_window_functions.reference | 39 +++++++++++++++++++ .../0_stateless/01591_window_functions.sql | 9 +++++ 4 files changed, 71 insertions(+), 12 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index b7b0c72eb94..634c588beb0 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -295,18 +295,14 @@ void WindowTransform::advanceFrameStartChoose() frame_started = true; return; case WindowFrame::BoundaryType::Current: - switch (window_description.frame.type) - { - case WindowFrame::FrameType::Rows: - // CURRENT ROW - frame_start = current_row; - frame_started = true; - return; - default: - // Fallthrough to the "not implemented" error. - break; - } - break; + // CURRENT ROW differs between frame types only in how the peer + // groups are accounted. 
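The change introduced here makes a CURRENT ROW frame start point at the beginning of the current peer group rather than at the current row itself; for ROWS frames the two coincide because every row is its own peer group. The sketch below shows how such a peer-group start can be tracked while scanning a column already sorted by the ORDER BY key (simplified: one key column, no partitioning, not the transform's real bookkeeping).

```cpp
#include <cstddef>
#include <vector>

// For each row of a column already sorted by the ORDER BY key, return the
// index where that row's peer group (run of equal keys) begins. With a
// RANGE frame, "CURRENT ROW" as the frame start means this index; with a
// ROWS frame it would simply be the row's own index.
std::vector<std::size_t> peerGroupStarts(const std::vector<int> & sorted_keys)
{
    std::vector<std::size_t> starts(sorted_keys.size());
    std::size_t group_start = 0;
    for (std::size_t i = 0; i < sorted_keys.size(); ++i)
    {
        // A new peer group begins whenever the key changes.
        if (i > 0 && sorted_keys[i] != sorted_keys[i - 1])
            group_start = i;
        starts[i] = group_start;
    }
    return starts;
}
```

For keys {0, 0, 1, 2, 2} this returns {0, 0, 2, 3, 3}, which is why, in the RANGE test further below, rows that share an ORDER BY value also share the same count over "current row and unbounded following".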
+ assert(partition_start <= peer_group_start); + assert(peer_group_start < partition_end); + assert(peer_group_start <= current_row); + frame_start = peer_group_start; + frame_started = true; + return; case WindowFrame::BoundaryType::Offset: switch (window_description.frame.type) { @@ -651,6 +647,13 @@ void WindowTransform::appendChunk(Chunk & chunk) // current_row, frame_start, frame_end, // frame_started, frame_ended); + // We now know that the current row is valid, so we can update the + // peer group start. + if (!arePeers(peer_group_start, current_row)) + { + peer_group_start = current_row; + } + // Advance the frame start. advanceFrameStart(); @@ -703,6 +706,8 @@ void WindowTransform::appendChunk(Chunk & chunk) writeOutCurrentRow(); // Move to the next row. The frame will have to be recalculated. + // The peer group start is updated at the beginning of the loop, + // because current_row might now be past-the-end. advanceRowNumber(current_row); first_not_ready_row = current_row; frame_ended = false; @@ -738,6 +743,7 @@ void WindowTransform::appendChunk(Chunk & chunk) prev_frame_start = partition_start; prev_frame_end = partition_start; assert(current_row == partition_start); + peer_group_start = partition_start; // fmt::print(stderr, "reinitialize agg data at start of {}\n", // new_partition_start); @@ -925,6 +931,7 @@ void WindowTransform::work() assert(next_output_block_number >= first_block_number); assert(frame_start.block >= first_block_number); assert(current_row.block >= first_block_number); + assert(peer_group_start.block >= first_block_number); } } diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 869a6fbfee2..c5e1c8b3653 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -257,6 +257,10 @@ public: // The row for which we are now computing the window functions. RowNumber current_row; + // The start of current peer group, needed for CURRENT ROW frame start. + // For ROWS frame, always equal to the current row, and for RANGE and GROUP + // frames may be earlier. + RowNumber peer_group_start; // The frame is [frame_start, frame_end) if frame_ended && frame_started, // and unknown otherwise. 
Note that when we move to the next row, both the diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index e8db9ee7725..e93d0eaead3 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -568,3 +568,42 @@ SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number -- a corner case select count() over (); 1 +-- RANGE CURRENT ROW frame start +select number, p, o, + count(*) over (partition by p order by o + range between current row and unbounded following) +from (select number, intDiv(number, 5) p, mod(number, 3) o + from numbers(31)) +order by p, o, number +settings max_block_size = 2; +0 0 0 5 +3 0 0 5 +1 0 1 3 +4 0 1 3 +2 0 2 1 +6 1 0 5 +9 1 0 5 +7 1 1 3 +5 1 2 2 +8 1 2 2 +12 2 0 5 +10 2 1 4 +13 2 1 4 +11 2 2 2 +14 2 2 2 +15 3 0 5 +18 3 0 5 +16 3 1 3 +19 3 1 3 +17 3 2 1 +21 4 0 5 +24 4 0 5 +22 4 1 3 +20 4 2 2 +23 4 2 2 +27 5 0 5 +25 5 1 4 +28 5 1 4 +26 5 2 2 +29 5 2 2 +30 6 0 1 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 486c8f871b7..d8e38a04eee 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -182,3 +182,12 @@ SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number -- a corner case select count() over (); + +-- RANGE CURRENT ROW frame start +select number, p, o, + count(*) over (partition by p order by o + range between current row and unbounded following) +from (select number, intDiv(number, 5) p, mod(number, 3) o + from numbers(31)) +order by p, o, number +settings max_block_size = 2; From ec382d881262093ce87b8693bae851fdbc42b075 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 3 Feb 2021 17:55:40 +0300 Subject: [PATCH 034/122] BETWEEN CURRENT ROW AND CURRENT ROW --- src/Interpreters/WindowDescription.cpp | 8 ++++++++ .../queries/0_stateless/01591_window_functions.reference | 7 +++++++ tests/queries/0_stateless/01591_window_functions.sql | 5 +++++ 3 files changed, 20 insertions(+) diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index 93a75c07161..12f4bdd4124 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -101,6 +101,14 @@ void WindowFrame::checkValid() const return; } + if (end_type == BoundaryType::Current + && begin_type == BoundaryType::Current) + { + // BETWEEN CURRENT ROW AND CURRENT ROW makes some sense for RANGE or + // GROUP frames, and is technically valid for ROWS frame. 
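With this addition, checkValid() accepts four shapes of frame. The sketch below mirrors those rules over a reduced frame description; the enum and field names are simplified stand-ins, not the real WindowFrame API.

```cpp
#include <cstdint>

enum class Bound { Unbounded, Current, Offset };

struct Frame
{
    Bound begin = Bound::Unbounded;
    std::int64_t begin_offset = 0;   // negative = PRECEDING, positive = FOLLOWING
    Bound end = Bound::Current;
    std::int64_t end_offset = 0;
};

bool isAccepted(const Frame & f)
{
    // Anything with an UNBOUNDED bound is accepted.
    if (f.begin == Bound::Unbounded || f.end == Bound::Unbounded)
        return true;

    // BETWEEN CURRENT ROW AND n FOLLOWING
    if (f.begin == Bound::Current && f.end == Bound::Offset && f.end_offset > 0)
        return true;

    // BETWEEN n PRECEDING AND CURRENT ROW
    if (f.end == Bound::Current && f.begin == Bound::Offset && f.begin_offset < 0)
        return true;

    // BETWEEN CURRENT ROW AND CURRENT ROW (the case added here)
    if (f.begin == Bound::Current && f.end == Bound::Current)
        return true;

    // Everything else, including frames with two explicit offsets,
    // is still rejected with BAD_ARGUMENTS at this point.
    return false;
}
```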
+ return; + } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Window frame '{}' is invalid", toString()); diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index e93d0eaead3..2e9c659e0af 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -607,3 +607,10 @@ settings max_block_size = 2; 26 5 2 2 29 5 2 2 30 6 0 1 +select + count(*) over (rows between current row and current row), + count(*) over (range between current row and current row) +from numbers(3); +1 3 +1 3 +1 3 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index d8e38a04eee..6c4190b47d3 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -191,3 +191,8 @@ from (select number, intDiv(number, 5) p, mod(number, 3) o from numbers(31)) order by p, o, number settings max_block_size = 2; + +select + count(*) over (rows between current row and current row), + count(*) over (range between current row and current row) +from numbers(3); From 82ab793731929e3739c5641a1d29ac0af20d4c60 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 2 Feb 2021 10:56:22 +0300 Subject: [PATCH 035/122] Fix missing type check in StorageEmbeddedRocksDB --- docker/test/fasttest/run.sh | 1 + .../RocksDB/StorageEmbeddedRocksDB.cpp | 119 +++++++++--------- .../0_stateless/01686_rocksdb.reference | 15 +++ tests/queries/0_stateless/01686_rocksdb.sql | 27 ++++ .../queries/0_stateless/arcadia_skip_list.txt | 1 + 5 files changed, 103 insertions(+), 60 deletions(-) create mode 100644 tests/queries/0_stateless/01686_rocksdb.reference create mode 100644 tests/queries/0_stateless/01686_rocksdb.sql diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 17cec7ae286..bb29959acd2 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -319,6 +319,7 @@ function run_tests # In fasttest, ENABLE_LIBRARIES=0, so rocksdb engine is not enabled by default 01504_rocksdb + 01686_rocksdb # Look at DistributedFilesToInsert, so cannot run in parallel. 
01460_DistributedFilesToInsert diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index 249026d1011..d7456966467 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -44,9 +45,12 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } +using FieldVectorPtr = std::shared_ptr; + // returns keys may be filter by condition -static bool traverseASTFilter(const String & primary_key, const DataTypePtr & primary_key_type, const ASTPtr & elem, const PreparedSets & sets, FieldVector & res) +static bool traverseASTFilter( + const String & primary_key, const DataTypePtr & primary_key_type, const ASTPtr & elem, const PreparedSets & sets, FieldVectorPtr & res) { const auto * function = elem->as(); if (!function) @@ -63,13 +67,9 @@ static bool traverseASTFilter(const String & primary_key, const DataTypePtr & pr else if (function->name == "or") { // make sure every child has the key filter condition - FieldVector child_res; for (const auto & child : function->arguments->children) - { - if (!traverseASTFilter(primary_key, primary_key_type, child, sets, child_res)) + if (!traverseASTFilter(primary_key, primary_key_type, child, sets, res)) return false; - } - res.insert(res.end(), child_res.begin(), child_res.end()); return true; } else if (function->name == "equals" || function->name == "in") @@ -108,9 +108,7 @@ static bool traverseASTFilter(const String & primary_key, const DataTypePtr & pr prepared_set->checkColumnsNumber(1); const auto & set_column = *prepared_set->getSetElements()[0]; for (size_t row = 0; row < set_column.size(); ++row) - { - res.push_back(set_column[row]); - } + res->push_back(set_column[row]); return true; } else @@ -125,10 +123,12 @@ static bool traverseASTFilter(const String & primary_key, const DataTypePtr & pr if (ident->name() != primary_key) return false; - //function->name == "equals" + /// function->name == "equals" if (const auto * literal = value->as()) { - res.push_back(literal->value); + auto converted_field = convertFieldToType(literal->value, *primary_key_type); + if (!converted_field.isNull()) + res->push_back(converted_field); return true; } } @@ -140,14 +140,14 @@ static bool traverseASTFilter(const String & primary_key, const DataTypePtr & pr /** Retrieve from the query a condition of the form `key = 'key'`, `key in ('xxx_'), from conjunctions in the WHERE clause. 
* TODO support key like search */ -static std::pair getFilterKeys(const String & primary_key, const DataTypePtr & primary_key_type, const SelectQueryInfo & query_info) +static std::pair getFilterKeys( + const String & primary_key, const DataTypePtr & primary_key_type, const SelectQueryInfo & query_info) { const auto & select = query_info.query->as(); if (!select.where()) - { - return std::make_pair(FieldVector{}, true); - } - FieldVector res; + return {{}, true}; + + FieldVectorPtr res = std::make_shared(); auto matched_keys = traverseASTFilter(primary_key, primary_key_type, select.where(), query_info.sets, res); return std::make_pair(res, !matched_keys); } @@ -159,23 +159,19 @@ public: EmbeddedRocksDBSource( const StorageEmbeddedRocksDB & storage_, const StorageMetadataPtr & metadata_snapshot_, - const FieldVector & keys_, - const size_t start_, - const size_t end_, + FieldVectorPtr keys_, + FieldVector::const_iterator begin_, + FieldVector::const_iterator end_, const size_t max_block_size_) : SourceWithProgress(metadata_snapshot_->getSampleBlock()) , storage(storage_) , metadata_snapshot(metadata_snapshot_) - , start(start_) + , keys(std::move(keys_)) + , begin(begin_) , end(end_) + , it(begin) , max_block_size(max_block_size_) { - // slice the keys - if (end > start) - { - keys.resize(end - start); - std::copy(keys_.begin() + start, keys_.begin() + end, keys.begin()); - } } String getName() const override @@ -185,27 +181,34 @@ public: Chunk generate() override { - if (processed_keys >= keys.size() || (start == end)) + if (it >= end) return {}; - std::vector slices_keys; - slices_keys.reserve(keys.size()); - std::vector values; - std::vector wbs(keys.size()); + size_t num_keys = end - begin; + + std::vector serialized_keys(num_keys); + std::vector slices_keys(num_keys); const auto & sample_block = metadata_snapshot->getSampleBlock(); const auto & key_column = sample_block.getByName(storage.primary_key); auto columns = sample_block.cloneEmptyColumns(); size_t primary_key_pos = sample_block.getPositionByName(storage.primary_key); - for (size_t i = processed_keys; i < std::min(keys.size(), processed_keys + max_block_size); ++i) + size_t rows_processed = 0; + while (it < end && rows_processed < max_block_size) { - key_column.type->serializeBinary(keys[i], wbs[i]); - auto str_ref = wbs[i].stringRef(); - slices_keys.emplace_back(str_ref.data, str_ref.size); + WriteBufferFromString wb(serialized_keys[rows_processed]); + key_column.type->serializeBinary(*it, wb); + wb.finalize(); + slices_keys[rows_processed] = std::move(serialized_keys[rows_processed]); + + ++it; + ++rows_processed; } + std::vector values; auto statuses = storage.rocksdb_ptr->MultiGet(rocksdb::ReadOptions(), slices_keys, &values); + for (size_t i = 0; i < statuses.size(); ++i) { if (statuses[i].ok()) @@ -221,7 +224,6 @@ public: } } } - processed_keys += max_block_size; UInt64 num_rows = columns.at(0)->size(); return Chunk(std::move(columns), num_rows); @@ -231,12 +233,11 @@ private: const StorageEmbeddedRocksDB & storage; const StorageMetadataPtr metadata_snapshot; - const size_t start; - const size_t end; + FieldVectorPtr keys; + FieldVector::const_iterator begin; + FieldVector::const_iterator end; + FieldVector::const_iterator it; const size_t max_block_size; - FieldVector keys; - - size_t processed_keys = 0; }; @@ -289,7 +290,8 @@ Pipe StorageEmbeddedRocksDB::read( unsigned num_streams) { metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); - FieldVector keys; + + FieldVectorPtr keys; bool all_scan = false; 
auto primary_key_data_type = metadata_snapshot->getSampleBlock().getByName(primary_key).type; @@ -302,37 +304,34 @@ Pipe StorageEmbeddedRocksDB::read( } else { - if (keys.empty()) + if (keys->empty()) return {}; - std::sort(keys.begin(), keys.end()); - auto unique_iter = std::unique(keys.begin(), keys.end()); - if (unique_iter != keys.end()) - keys.erase(unique_iter, keys.end()); + std::sort(keys->begin(), keys->end()); + keys->erase(std::unique(keys->begin(), keys->end()), keys->end()); Pipes pipes; - size_t start = 0; - size_t end; - const size_t num_threads = std::min(size_t(num_streams), keys.size()); - const size_t batch_per_size = ceil(keys.size() * 1.0 / num_threads); + size_t num_keys = keys->size(); + size_t num_threads = std::min(size_t(num_streams), keys->size()); - for (size_t t = 0; t < num_threads; ++t) + assert(num_keys <= std::numeric_limits::max()); + assert(num_threads <= std::numeric_limits::max()); + + for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { - if (start >= keys.size()) - start = end = 0; - else - end = start + batch_per_size > keys.size() ? keys.size() : start + batch_per_size; + size_t begin = num_keys * thread_idx / num_threads; + size_t end = num_keys * (thread_idx + 1) / num_threads; - pipes.emplace_back( - std::make_shared(*this, metadata_snapshot, keys, start, end, max_block_size)); - start += batch_per_size; + pipes.emplace_back(std::make_shared( + *this, metadata_snapshot, keys, keys->begin() + begin, keys->begin() + end, max_block_size)); } return Pipe::unitePipes(std::move(pipes)); } } -BlockOutputStreamPtr StorageEmbeddedRocksDB::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) +BlockOutputStreamPtr StorageEmbeddedRocksDB::write( + const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) { return std::make_shared(*this, metadata_snapshot); } diff --git a/tests/queries/0_stateless/01686_rocksdb.reference b/tests/queries/0_stateless/01686_rocksdb.reference new file mode 100644 index 00000000000..fa4e12d51ff --- /dev/null +++ b/tests/queries/0_stateless/01686_rocksdb.reference @@ -0,0 +1,15 @@ +123 Hello, world (123) +-- +-- +123 Hello, world (123) +4567 Hello, world (4567) +-- +-- +0 Hello, world (0) +-- +123 Hello, world (123) +456 Hello, world (456) +-- +99 Hello, world (99) +999 Hello, world (999) +9999 Hello, world (9999) diff --git a/tests/queries/0_stateless/01686_rocksdb.sql b/tests/queries/0_stateless/01686_rocksdb.sql new file mode 100644 index 00000000000..c9b133acff3 --- /dev/null +++ b/tests/queries/0_stateless/01686_rocksdb.sql @@ -0,0 +1,27 @@ +DROP TABLE IF EXISTS test; + +CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB PRIMARY KEY(key); + +INSERT INTO test SELECT number, format('Hello, world ({})', toString(number)) FROM numbers(10000); + +SELECT * FROM test WHERE key = 123; +SELECT '--'; +SELECT * FROM test WHERE key = -123; +SELECT '--'; +SELECT * FROM test WHERE key = 123 OR key = 4567 ORDER BY key; +SELECT '--'; +SELECT * FROM test WHERE key = NULL; +SELECT '--'; +SELECT * FROM test WHERE key = NULL OR key = 0; +SELECT '--'; +SELECT * FROM test WHERE key IN (123, 456, -123) ORDER BY key; +SELECT '--'; +SELECT * FROM test WHERE key = 'Hello'; -- { serverError 53 } + +DETACH TABLE test NO DELAY; +ATTACH TABLE test; + +SELECT * FROM test WHERE key IN (99, 999, 9999, -123) ORDER BY key; + +DROP TABLE IF EXISTS test; + diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt 
b/tests/queries/0_stateless/arcadia_skip_list.txt index a33ff98032b..d262fb9058e 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -200,3 +200,4 @@ 01676_clickhouse_client_autocomplete 01671_aggregate_function_group_bitmap_data 01674_executable_dictionary_implicit_key +01686_rocksdb From 05c5c8ed80e7895e1d3507ee089f1456f0bf5686 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 3 Feb 2021 21:23:54 +0300 Subject: [PATCH 036/122] Avoid UBSan report in pointInPolygon --- src/Functions/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 321aa5e2196..1c3beb2e47d 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -117,3 +117,6 @@ target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_array) if (USE_STATS) target_link_libraries(clickhouse_functions PRIVATE stats) endif() + +# Signed integer overflow on user-provided data inside boost::geometry - ignore. +set_source_files_properties("pointInPolygon.cpp" PROPERTIES COMPILE_FLAGS -fno-sanitize=signed-integer-overflow) From eb8b2e883aae2257d926f4894f16fa22f86ad170 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 3 Feb 2021 21:25:04 +0300 Subject: [PATCH 037/122] Add a test --- tests/queries/0_stateless/01700_point_in_polygon_ubsan.reference | 0 tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql | 1 + 2 files changed, 1 insertion(+) create mode 100644 tests/queries/0_stateless/01700_point_in_polygon_ubsan.reference create mode 100644 tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql diff --git a/tests/queries/0_stateless/01700_point_in_polygon_ubsan.reference b/tests/queries/0_stateless/01700_point_in_polygon_ubsan.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql b/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql new file mode 100644 index 00000000000..d7859bdc5a9 --- /dev/null +++ b/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql @@ -0,0 +1 @@ +SELECT pointInPolygon((0, 0), [[(0, 0), (10, 10), (256, -9223372036854775808)]]) FORMAT Null; \ No newline at end of file From 0aca4a740ccd6462fbe0360f3aa6ea6db9344a5c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 3 Feb 2021 21:26:03 +0300 Subject: [PATCH 038/122] Update 01700_point_in_polygon_ubsan.sql --- tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql b/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql index d7859bdc5a9..97db40ab65e 100644 --- a/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql +++ b/tests/queries/0_stateless/01700_point_in_polygon_ubsan.sql @@ -1 +1 @@ -SELECT pointInPolygon((0, 0), [[(0, 0), (10, 10), (256, -9223372036854775808)]]) FORMAT Null; \ No newline at end of file +SELECT pointInPolygon((0, 0), [[(0, 0), (10, 10), (256, -9223372036854775808)]]) FORMAT Null; From d6372bd3d91408140337bd77b4b22e9c133eb7ce Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 09:38:42 +0300 Subject: [PATCH 039/122] linker woes --- src/Processors/Transforms/WindowTransform.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index b7b0c72eb94..c5c4432e886 100644 --- 
a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -22,8 +22,6 @@ WindowTransform::WindowTransform(const Block & input_header_, , input_header(input_header_) , window_description(window_description_) { - window_description.frame.checkValid(); - workspaces.reserve(functions.size()); for (const auto & f : functions) { From b281d39036b8c7098bb6b98e217ee331278c428f Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 09:40:09 +0300 Subject: [PATCH 040/122] yamake --- src/Interpreters/ya.make | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make index 1cadc447e59..6a155749ddf 100644 --- a/src/Interpreters/ya.make +++ b/src/Interpreters/ya.make @@ -145,6 +145,7 @@ SRCS( TranslateQualifiedNamesVisitor.cpp TreeOptimizer.cpp TreeRewriter.cpp + WindowDescription.cpp addMissingDefaults.cpp addTypeConversionToAST.cpp castColumn.cpp From 773b364fe43b39df53e8ffad220c301d379446b2 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 09:49:11 +0300 Subject: [PATCH 041/122] check some bounds --- src/Parsers/ExpressionElementParsers.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index c129c312d11..2434f7dbc42 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -568,6 +568,15 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p } node->frame.begin_offset = value.get(); node->frame.begin_type = WindowFrame::BoundaryType::Offset; + // We can easily get a UINT64_MAX here, which doesn't even fit into + // int64_t. Not sure what checks we are going to need here after we + // support floats and dates. 
+ if (node->frame.begin_offset > INT_MAX || node->frame.begin_offset < INT_MIN) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Frame offset must be between {} and {}, but {} is given", + INT_MAX, INT_MIN, node->frame.begin_offset); + } } else { @@ -622,6 +631,13 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p } node->frame.end_offset = value.get(); node->frame.end_type = WindowFrame::BoundaryType::Offset; + + if (node->frame.end_offset > INT_MAX || node->frame.end_offset < INT_MIN) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Frame offset must be between {} and {}, but {} is given", + INT_MAX, INT_MIN, node->frame.end_offset); + } } else { From 1b6262f874bfa26870bf404e2149698016ef75d3 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 09:51:46 +0300 Subject: [PATCH 042/122] cleanup --- src/Parsers/ExpressionElementParsers.cpp | 2 +- src/Processors/Transforms/WindowTransform.cpp | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 2434f7dbc42..3f4403bc264 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -585,7 +585,7 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p if (keyword_preceding.ignore(pos, expected)) { - node->frame.begin_offset = - node->frame.begin_offset; + node->frame.begin_offset = -node->frame.begin_offset; } else if (keyword_following.ignore(pos, expected)) { diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 634c588beb0..775a9e23191 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -434,18 +434,15 @@ void WindowTransform::advanceFrameEndCurrentRow() // fmt::print(stderr, "first row {} last {}\n", frame_end.row, rows_end); - // We could retreat the frame_end here, but for some reason I am reluctant - // to do this... It would have better data locality. - auto reference = current_row; + // Advance frame_end while it is still peers with the current row. 
for (; frame_end.row < rows_end; ++frame_end.row) { - if (!arePeers(reference, frame_end)) + if (!arePeers(current_row, frame_end)) { // fmt::print(stderr, "{} and {} don't match\n", reference, frame_end); frame_ended = true; return; } - reference = frame_end; } // Might have gotten to the end of the current block, have to properly From c1c71fc8e9bb23b48fac327c52b1c1ce77400e2a Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 10:41:09 +0300 Subject: [PATCH 043/122] ROWS OFFSET frame end --- src/Interpreters/WindowDescription.cpp | 15 +++ src/Processors/Transforms/WindowTransform.cpp | 59 ++++++++++-- src/Processors/Transforms/WindowTransform.h | 3 +- .../01591_window_functions.reference | 95 +++++++++++++++++++ .../0_stateless/01591_window_functions.sql | 16 ++++ 5 files changed, 181 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index 12f4bdd4124..6e72f056b16 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -109,6 +109,21 @@ void WindowFrame::checkValid() const return; } + if (end_type == BoundaryType::Offset + && begin_type == BoundaryType::Offset) + { + if (type == FrameType::Rows) + { + if (end_offset >= begin_offset) + { + return; + } + } + + // For RANGE and GROUPS, we must check that end follows begin if sorted + // according to ORDER BY (we don't support them yet). + } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Window frame '{}' is invalid", toString()); diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 2eaabaf1dc5..474d1a3c452 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -271,16 +271,22 @@ void WindowTransform::advanceFrameStartRowsOffset() return; } - assert(frame_start <= partition_end); - if (frame_start == partition_end && partition_ended) + if (partition_end <= frame_start) { // A FOLLOWING frame start ran into the end of partition. - frame_started = true; + frame_start = partition_end; + frame_started = partition_ended; return; } + // Handled the equality case above. Now the frame start is inside the + // partition, if we walked all the offset, it's final. assert(partition_start < frame_start); frame_started = offset_left == 0; + + // If we ran into the start of data (offset left is negative), we won't be + // able to make progress. Should have handled this case above. + assert(offset_left >= 0); } void WindowTransform::advanceFrameStartChoose() @@ -463,6 +469,39 @@ void WindowTransform::advanceFrameEndUnbounded() frame_ended = partition_ended; } +void WindowTransform::advanceFrameEndRowsOffset() +{ + // Walk the specified offset from the current row. The "+1" is needed + // because the frame_end is a past-the-end pointer. + const auto [moved_row, offset_left] = moveRowNumber(current_row, + window_description.frame.end_offset + 1); + + if (partition_end <= moved_row) + { + // Clamp to the end of partition. It might not have ended yet, in which + // case wait for more data. + frame_end = partition_end; + frame_ended = partition_ended; + return; + } + + if (moved_row <= partition_start) + { + // Clamp to the start of partition. + frame_end = partition_start; + frame_ended = true; + return; + } + + // Frame end inside partition, if we walked all the offset, it's final. 
+ frame_end = moved_row; + frame_ended = offset_left == 0; + + // If we ran into the start of data (offset left is negative), we won't be + // able to make progress. Should have handled this case above. + assert(offset_left >= 0); +} + void WindowTransform::advanceFrameEnd() { // No reason for this function to be called again after it succeeded. @@ -479,9 +518,17 @@ void WindowTransform::advanceFrameEnd() advanceFrameEndUnbounded(); break; case WindowFrame::BoundaryType::Offset: - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "The frame end type '{}' is not implemented", - WindowFrame::toString(window_description.frame.end_type)); + switch (window_description.frame.type) + { + case WindowFrame::FrameType::Rows: + advanceFrameEndRowsOffset(); + break; + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "The frame end type '{}' is not implemented", + WindowFrame::toString(window_description.frame.end_type)); + } + break; } // fmt::print(stderr, "frame_end {} -> {}\n", frame_end_before, frame_end); diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index c5e1c8b3653..bb1a9aefd64 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -109,9 +109,10 @@ private: void advanceFrameStart(); void advanceFrameStartChoose(); void advanceFrameStartRowsOffset(); - void advanceFrameEnd(); void advanceFrameEndCurrentRow(); void advanceFrameEndUnbounded(); + void advanceFrameEndRowsOffset(); + void advanceFrameEnd(); bool arePeers(const RowNumber & x, const RowNumber & y) const; void updateAggregationState(); void writeOutCurrentRow(); diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 2e9c659e0af..1993c59fc8b 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -558,11 +558,106 @@ settings max_block_size = 2; 28 5 3 2 1 29 5 2 1 0 30 6 1 1 0 +-- ROWS offset frame start and end +select number, p, + count(*) over (partition by p order by number + rows between 2 preceding and 2 following) +from (select number, intDiv(number, 7) p from numbers(71)) +order by p, number +settings max_block_size = 2; +0 0 3 +1 0 4 +2 0 5 +3 0 5 +4 0 5 +5 0 4 +6 0 3 +7 1 3 +8 1 4 +9 1 5 +10 1 5 +11 1 5 +12 1 4 +13 1 3 +14 2 3 +15 2 4 +16 2 5 +17 2 5 +18 2 5 +19 2 4 +20 2 3 +21 3 3 +22 3 4 +23 3 5 +24 3 5 +25 3 5 +26 3 4 +27 3 3 +28 4 3 +29 4 4 +30 4 5 +31 4 5 +32 4 5 +33 4 4 +34 4 3 +35 5 3 +36 5 4 +37 5 5 +38 5 5 +39 5 5 +40 5 4 +41 5 3 +42 6 3 +43 6 4 +44 6 5 +45 6 5 +46 6 5 +47 6 4 +48 6 3 +49 7 3 +50 7 4 +51 7 5 +52 7 5 +53 7 5 +54 7 4 +55 7 3 +56 8 3 +57 8 4 +58 8 5 +59 8 5 +60 8 5 +61 8 4 +62 8 3 +63 9 3 +64 9 4 +65 9 5 +66 9 5 +67 9 5 +68 9 4 +69 9 3 +70 10 1 SELECT count(*) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM numbers(4); 1 2 3 3 +-- frame boundaries that runs into the partition end +select + count() over (partition by intDiv(number, 3) + rows between 100 following and unbounded following), + count() over (partition by intDiv(number, 3) + rows between current row and 100 following) +from numbers(10); +0 3 +0 2 +0 1 +0 3 +0 2 +0 1 +0 3 +0 2 +0 1 +0 1 -- seen a use-after-free under MSan in this query once SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; -- a corner case 
diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 6c4190b47d3..400d4832144 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -175,8 +175,24 @@ from (select number, intDiv(number, 5) p from numbers(31)) order by p, number settings max_block_size = 2; +-- ROWS offset frame start and end +select number, p, + count(*) over (partition by p order by number + rows between 2 preceding and 2 following) +from (select number, intDiv(number, 7) p from numbers(71)) +order by p, number +settings max_block_size = 2; + SELECT count(*) OVER (ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM numbers(4); +-- frame boundaries that runs into the partition end +select + count() over (partition by intDiv(number, 3) + rows between 100 following and unbounded following), + count() over (partition by intDiv(number, 3) + rows between current row and 100 following) +from numbers(10); + -- seen a use-after-free under MSan in this query once SELECT number, max(number) OVER (PARTITION BY intDiv(number, 7) ORDER BY number ASC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) FROM numbers(1024) SETTINGS max_block_size = 2 FORMAT Null; From 1cc9e03d0cca12bedd0559e30843b1356d29da51 Mon Sep 17 00:00:00 2001 From: kreuzerkrieg Date: Tue, 2 Feb 2021 16:09:43 +0200 Subject: [PATCH 044/122] return `DiskType` instead of `String` IDisk::getType() as in the rest of storage interfaces --- src/Disks/DiskDecorator.h | 2 +- src/Disks/DiskLocal.h | 2 +- src/Disks/DiskMemory.h | 2 +- src/Disks/IDisk.h | 25 +++++++++++++++++++++- src/Disks/S3/DiskS3.h | 2 +- src/Storages/System/StorageSystemDisks.cpp | 2 +- 6 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index b50252c2c97..0f66ef63800 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -48,7 +48,7 @@ public: void setReadOnly(const String & path) override; void createHardLink(const String & src_path, const String & dst_path) override; void truncateFile(const String & path, size_t size) override; - const String getType() const override { return delegate->getType(); } + DiskType::Type getType() const override { return delegate->getType(); } Executor & getExecutor() override; SyncGuardPtr getDirectorySyncGuard(const String & path) const override; diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index d8d45290986..495e511f07e 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -100,7 +100,7 @@ public: void truncateFile(const String & path, size_t size) override; - const String getType() const override { return "local"; } + DiskType::Type getType() const override { return DiskType::Type::Local; } SyncGuardPtr getDirectorySyncGuard(const String & path) const override; diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index 3ebc76661d4..1ef38801a6c 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -91,7 +91,7 @@ public: void truncateFile(const String & path, size_t size) override; - const String getType() const override { return "memory"; } + DiskType::Type getType() const override { return DiskType::Type::RAM; } private: void createDirectoriesImpl(const String & path); diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index f41490a0807..3bbe553ba59 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -57,6 +57,29 @@ public: using SpacePtr = std::shared_ptr; +struct DiskType +{ + enum 
class Type + { + Local, + RAM, + S3 + }; + static String toString(Type disk_type) + { + switch (disk_type) + { + case Type::Local: + return "local"; + case Type::RAM: + return "memory"; + case Type::S3: + return "s3"; + } + __builtin_unreachable(); + } +}; + /** * A guard, that should synchronize file's or directory's state * with storage device (e.g. fsync in POSIX) in its destructor. @@ -191,7 +214,7 @@ public: virtual void truncateFile(const String & path, size_t size); /// Return disk type - "local", "s3", etc. - virtual const String getType() const = 0; + virtual DiskType::Type getType() const = 0; /// Invoked when Global Context is shutdown. virtual void shutdown() { } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 3dbd9029fb2..4447d49b0ed 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -105,7 +105,7 @@ public: void setReadOnly(const String & path) override; - const String getType() const override { return "s3"; } + DiskType::Type getType() const override { return DiskType::Type::S3; } void shutdown() override; diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index fbbee51e34e..b04d24cc705 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -51,7 +51,7 @@ Pipe StorageSystemDisks::read( col_free->insert(disk_ptr->getAvailableSpace()); col_total->insert(disk_ptr->getTotalSpace()); col_keep->insert(disk_ptr->getKeepingFreeSpace()); - col_type->insert(disk_ptr->getType()); + col_type->insert(DiskType::toString(disk_ptr->getType())); } Columns res_columns; From 81b0cc381da8366fe6356a852c6b372ed078dc6c Mon Sep 17 00:00:00 2001 From: hexiaoting Date: Thu, 4 Feb 2021 17:36:09 +0800 Subject: [PATCH 045/122] enlarge signed type for modulo function --- src/DataTypes/NumberTraits.h | 5 ++++- tests/queries/0_stateless/01692_mod_enlarge_type.reference | 2 ++ tests/queries/0_stateless/01692_mod_enlarge_type.sql | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01692_mod_enlarge_type.reference create mode 100644 tests/queries/0_stateless/01692_mod_enlarge_type.sql diff --git a/src/DataTypes/NumberTraits.h b/src/DataTypes/NumberTraits.h index 3aa00c68274..c68a73eeb17 100644 --- a/src/DataTypes/NumberTraits.h +++ b/src/DataTypes/NumberTraits.h @@ -108,7 +108,10 @@ template struct ResultOfIntegerDivision */ template struct ResultOfModulo { - using Type0 = typename Construct || is_signed_v, false, sizeof(B)>::Type; + using Type0 = typename Construct< + is_signed_v || is_signed_v, + false, + (is_signed_v || is_signed_v) ? 
std::max(sizeof(A), sizeof(B)) : sizeof(B)>::Type; using Type = std::conditional_t || std::is_floating_point_v, Float64, Type0>; }; diff --git a/tests/queries/0_stateless/01692_mod_enlarge_type.reference b/tests/queries/0_stateless/01692_mod_enlarge_type.reference new file mode 100644 index 00000000000..6d962821ad6 --- /dev/null +++ b/tests/queries/0_stateless/01692_mod_enlarge_type.reference @@ -0,0 +1,2 @@ +-199 +146 Int32 diff --git a/tests/queries/0_stateless/01692_mod_enlarge_type.sql b/tests/queries/0_stateless/01692_mod_enlarge_type.sql new file mode 100644 index 00000000000..1a97d084a4f --- /dev/null +++ b/tests/queries/0_stateless/01692_mod_enlarge_type.sql @@ -0,0 +1,2 @@ +SELECT toInt32(-199) % 200; +select toInt32(441746) % 150 as a , toTypeName(a); From 41698d65f556a0b93333520d80a95000519e7cc5 Mon Sep 17 00:00:00 2001 From: hexiaoting Date: Thu, 4 Feb 2021 17:50:59 +0800 Subject: [PATCH 046/122] fix style error --- src/DataTypes/NumberTraits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataTypes/NumberTraits.h b/src/DataTypes/NumberTraits.h index c68a73eeb17..4d534df0b42 100644 --- a/src/DataTypes/NumberTraits.h +++ b/src/DataTypes/NumberTraits.h @@ -110,7 +110,7 @@ template struct ResultOfModulo { using Type0 = typename Construct< is_signed_v || is_signed_v, - false, + false, (is_signed_v || is_signed_v) ? std::max(sizeof(A), sizeof(B)) : sizeof(B)>::Type; using Type = std::conditional_t || std::is_floating_point_v, Float64, Type0>; }; From 7a2279d06707ea3a4a99842b10048cc3fd2c31e9 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 4 Feb 2021 14:44:00 +0300 Subject: [PATCH 047/122] Fix removing filter column from expression after Filter actions split --- src/Interpreters/ActionsDAG.cpp | 49 +++++++++++-------- src/Interpreters/ActionsDAG.h | 15 +++--- .../QueryPlan/Optimizations/splitFilter.cpp | 5 +- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index d8c40ffda2f..becd3f4f4a2 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -454,36 +454,42 @@ bool ActionsDAG::tryRestoreColumn(const std::string & column_name) return false; } -void ActionsDAG::removeUnusedInput(const std::string & column_name) +bool ActionsDAG::removeUnusedResult(const std::string & column_name) { + /// Find column in index and remove. + const Node * col; + { + auto it = index.begin(); + for (; it != index.end(); ++it) + if ((*it)->result_name == column_name) + break; + + if (it == index.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found result {} in ActionsDAG\n{}", column_name, dumpDAG()); + + col = *it; + index.remove(it); + } + + /// Check if column is in input. auto it = inputs.begin(); for (; it != inputs.end(); ++it) - if ((*it)->result_name == column_name) + if (*it == col) break; if (it == inputs.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found input {} in ActionsDAG\n{}", column_name, dumpDAG()); + return false; - auto * input = *it; + /// Check column has no dependent. for (const auto & node : nodes) for (const auto * child : node.children) - if (input == child) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Cannot remove input {} because it has dependent nodes in ActionsDAG\n{}", - column_name, dumpDAG()); - - for (auto jt = index.begin(); jt != index.end(); ++jt) - { - if (*jt == input) - { - index.remove(jt); - break; - } - } + if (col == child) + return false; + /// Remove from nodes and inputs. 
for (auto jt = nodes.begin(); jt != nodes.end(); ++jt) { - if (&(*jt) == input) + if (&(*jt) == *it) { nodes.erase(jt); break; @@ -491,6 +497,7 @@ void ActionsDAG::removeUnusedInput(const std::string & column_name) } inputs.erase(it); + return true; } ActionsDAGPtr ActionsDAG::clone() const @@ -844,7 +851,7 @@ ActionsDAGPtr ActionsDAG::merge(ActionsDAG && first, ActionsDAG && second) return std::make_shared(std::move(first)); } -std::pair ActionsDAG::split(std::unordered_set split_nodes) const +ActionsDAG::SplitResult ActionsDAG::split(std::unordered_set split_nodes) const { /// Split DAG into two parts. /// (first_nodes, first_index) is a part which will have split_list in result. @@ -1045,7 +1052,7 @@ std::pair ActionsDAG::split(std::unordered_set ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & array_joined_columns) const +ActionsDAG::SplitResult ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & array_joined_columns) const { struct Frame @@ -1113,7 +1120,7 @@ std::pair ActionsDAG::splitActionsBeforeArrayJoin return res; } -std::pair ActionsDAG::splitActionsForFilter(const std::string & column_name) const +ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & column_name) const { auto it = index.begin(); for (; it != index.end(); ++it) diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index b12da30e24f..fa5ae2ac83f 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -214,9 +214,10 @@ public: /// If column is not in index, try to find it in nodes and insert back into index. bool tryRestoreColumn(const std::string & column_name); - /// Find column in input. Remove it from input and index. - /// Checks that column in inputs and has not dependent nodes. - void removeUnusedInput(const std::string & column_name); + /// Find column in result. Remove it from index. + /// If columns is in inputs and has no dependent nodes, remove it from inputs too. + /// Return true if column was removed from inputs. + bool removeUnusedResult(const std::string & column_name); void projectInput() { settings.project_input = true; } void removeUnusedActions(const Names & required_names); @@ -255,18 +256,20 @@ public: /// Otherwise, any two actions may be combined. static ActionsDAGPtr merge(ActionsDAG && first, ActionsDAG && second); + using SplitResult = std::pair; + /// Split ActionsDAG into two DAGs, where first part contains all nodes from split_nodes and their children. /// Execution of first then second parts on block is equivalent to execution of initial DAG. /// First DAG and initial DAG have equal inputs, second DAG and initial DAG has equal index (outputs). /// Second DAG inputs may contain less inputs then first DAG (but also include other columns). - std::pair split(std::unordered_set split_nodes) const; + SplitResult split(std::unordered_set split_nodes) const; /// Splits actions into two parts. Returned first half may be swapped with ARRAY JOIN. - std::pair splitActionsBeforeArrayJoin(const NameSet & array_joined_columns) const; + SplitResult splitActionsBeforeArrayJoin(const NameSet & array_joined_columns) const; /// Splits actions into two parts. First part has minimal size sufficient for calculation of column_name. /// Index of initial actions must contain column_name. 
- std::pair splitActionsForFilter(const std::string & column_name) const; + SplitResult splitActionsForFilter(const std::string & column_name) const; private: Node & addNode(Node node, bool can_replace = false); diff --git a/src/Processors/QueryPlan/Optimizations/splitFilter.cpp b/src/Processors/QueryPlan/Optimizations/splitFilter.cpp index 38ba8f25b24..8c212936195 100644 --- a/src/Processors/QueryPlan/Optimizations/splitFilter.cpp +++ b/src/Processors/QueryPlan/Optimizations/splitFilter.cpp @@ -24,8 +24,9 @@ size_t trySplitFilter(QueryPlan::Node * node, QueryPlan::Nodes & nodes) if (split.second->trivial()) return 0; + bool remove_filter = false; if (filter_step->removesFilterColumn()) - split.second->removeUnusedInput(filter_step->getFilterColumnName()); + remove_filter = split.second->removeUnusedResult(filter_step->getFilterColumnName()); auto description = filter_step->getStepDescription(); @@ -37,7 +38,7 @@ size_t trySplitFilter(QueryPlan::Node * node, QueryPlan::Nodes & nodes) filter_node.children.at(0)->step->getOutputStream(), std::move(split.first), filter_step->getFilterColumnName(), - filter_step->removesFilterColumn()); + remove_filter); node->step = std::make_unique(filter_node.step->getOutputStream(), std::move(split.second)); From e6f1ce48fe50c83a88f8c8feb985c1a0ea6180d3 Mon Sep 17 00:00:00 2001 From: Haavard Kvaalen Date: Thu, 4 Feb 2021 11:37:12 +0100 Subject: [PATCH 048/122] Fix handling of TABLE_MAP_EVENT from MySQL The MySQL replication code assumed that row update events would be preceded by a single TABLE_MAP_EVENT. However, if a single SQL statement modifies rows in multiple tables, MySQL will first send table map events for all involved tables, and then row update events. Depending on circumstances, this could lead to an exception when the row update was processed, the update could be incorrectly dropped, or the update could be applied to the wrong table. --- src/Core/MySQL/MySQLReplication.cpp | 46 +++++++++++++++---- src/Core/MySQL/MySQLReplication.h | 36 +++++++++++---- .../materialize_with_ddl.py | 15 ++++++ .../test_materialize_mysql_database/test.py | 5 ++ 4 files changed, 86 insertions(+), 16 deletions(-) diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index b86d6447e26..8e1e0cd7646 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -136,6 +136,7 @@ namespace MySQLReplication out << "XID: " << this->xid << '\n'; } + /// https://dev.mysql.com/doc/internals/en/table-map-event.html void TableMapEvent::parseImpl(ReadBuffer & payload) { payload.readStrict(reinterpret_cast(&table_id), 6); @@ -257,15 +258,19 @@ namespace MySQLReplication out << "Null Bitmap: " << bitmap_str << '\n'; } - void RowsEvent::parseImpl(ReadBuffer & payload) + void RowsEventHeader::parse(ReadBuffer & payload) { payload.readStrict(reinterpret_cast(&table_id), 6); payload.readStrict(reinterpret_cast(&flags), 2); + UInt16 extra_data_len; /// This extra_data_len contains the 2 bytes length. 
payload.readStrict(reinterpret_cast(&extra_data_len), 2); payload.ignore(extra_data_len - 2); + } + void RowsEvent::parseImpl(ReadBuffer & payload) + { number_columns = readLengthEncodedNumber(payload); size_t columns_bitmap_size = (number_columns + 7) / 8; switch (header.type) @@ -795,37 +800,50 @@ namespace MySQLReplication { event = std::make_shared(std::move(event_header)); event->parseEvent(event_payload); - table_map = std::static_pointer_cast(event); + auto table_map = std::static_pointer_cast(event); + table_maps[table_map->table_id] = table_map; break; } case WRITE_ROWS_EVENT_V1: case WRITE_ROWS_EVENT_V2: { - if (doReplicate()) - event = std::make_shared(table_map, std::move(event_header)); + RowsEventHeader rows_header(event_header.type); + rows_header.parse(event_payload); + if (doReplicate(rows_header.table_id)) + event = std::make_shared(table_maps.at(rows_header.table_id), std::move(event_header), rows_header); else event = std::make_shared(std::move(event_header)); event->parseEvent(event_payload); + if (rows_header.flags & ROWS_END_OF_STATEMENT) + table_maps.clear(); break; } case DELETE_ROWS_EVENT_V1: case DELETE_ROWS_EVENT_V2: { - if (doReplicate()) - event = std::make_shared(table_map, std::move(event_header)); + RowsEventHeader rows_header(event_header.type); + rows_header.parse(event_payload); + if (doReplicate(rows_header.table_id)) + event = std::make_shared(table_maps.at(rows_header.table_id), std::move(event_header), rows_header); else event = std::make_shared(std::move(event_header)); event->parseEvent(event_payload); + if (rows_header.flags & ROWS_END_OF_STATEMENT) + table_maps.clear(); break; } case UPDATE_ROWS_EVENT_V1: case UPDATE_ROWS_EVENT_V2: { - if (doReplicate()) - event = std::make_shared(table_map, std::move(event_header)); + RowsEventHeader rows_header(event_header.type); + rows_header.parse(event_payload); + if (doReplicate(rows_header.table_id)) + event = std::make_shared(table_maps.at(rows_header.table_id), std::move(event_header), rows_header); else event = std::make_shared(std::move(event_header)); event->parseEvent(event_payload); + if (rows_header.flags & ROWS_END_OF_STATEMENT) + table_maps.clear(); break; } case GTID_EVENT: @@ -843,6 +861,18 @@ namespace MySQLReplication } } } + + bool MySQLFlavor::doReplicate(UInt64 table_id) + { + if (replicate_do_db.empty()) + return false; + if (table_id == 0x00ffffff) { + // Special "dummy event" + return false; + } + auto table_map = table_maps.at(table_id); + return table_map->schema == replicate_do_db; + } } } diff --git a/src/Core/MySQL/MySQLReplication.h b/src/Core/MySQL/MySQLReplication.h index 7c7604cad58..ae8dc80f673 100644 --- a/src/Core/MySQL/MySQLReplication.h +++ b/src/Core/MySQL/MySQLReplication.h @@ -430,6 +430,22 @@ namespace MySQLReplication void parseMeta(String meta); }; + enum RowsEventFlags + { + ROWS_END_OF_STATEMENT = 1 + }; + + class RowsEventHeader + { + public: + EventType type; + UInt64 table_id; + UInt16 flags; + + RowsEventHeader(EventType type_) : type(type_), table_id(0), flags(0) {}; + void parse(ReadBuffer & payload); + }; + class RowsEvent : public EventBase { public: @@ -438,9 +454,11 @@ namespace MySQLReplication String table; std::vector rows; - RowsEvent(std::shared_ptr table_map_, EventHeader && header_) - : EventBase(std::move(header_)), number_columns(0), table_id(0), flags(0), extra_data_len(0), table_map(table_map_) + RowsEvent(std::shared_ptr table_map_, EventHeader && header_, const RowsEventHeader & rows_header) + : EventBase(std::move(header_)), 
number_columns(0), table_map(table_map_) { + table_id = rows_header.table_id; + flags = rows_header.flags; schema = table_map->schema; table = table_map->table; } @@ -450,7 +468,6 @@ namespace MySQLReplication protected: UInt64 table_id; UInt16 flags; - UInt16 extra_data_len; Bitmap columns_present_bitmap1; Bitmap columns_present_bitmap2; @@ -464,21 +481,24 @@ namespace MySQLReplication class WriteRowsEvent : public RowsEvent { public: - WriteRowsEvent(std::shared_ptr table_map_, EventHeader && header_) : RowsEvent(table_map_, std::move(header_)) {} + WriteRowsEvent(std::shared_ptr table_map_, EventHeader && header_, const RowsEventHeader & rows_header) + : RowsEvent(table_map_, std::move(header_), rows_header) {} MySQLEventType type() const override { return MYSQL_WRITE_ROWS_EVENT; } }; class DeleteRowsEvent : public RowsEvent { public: - DeleteRowsEvent(std::shared_ptr table_map_, EventHeader && header_) : RowsEvent(table_map_, std::move(header_)) {} + DeleteRowsEvent(std::shared_ptr table_map_, EventHeader && header_, const RowsEventHeader & rows_header) + : RowsEvent(table_map_, std::move(header_), rows_header) {} MySQLEventType type() const override { return MYSQL_DELETE_ROWS_EVENT; } }; class UpdateRowsEvent : public RowsEvent { public: - UpdateRowsEvent(std::shared_ptr table_map_, EventHeader && header_) : RowsEvent(table_map_, std::move(header_)) {} + UpdateRowsEvent(std::shared_ptr table_map_, EventHeader && header_, const RowsEventHeader & rows_header) + : RowsEvent(table_map_, std::move(header_), rows_header) {} MySQLEventType type() const override { return MYSQL_UPDATE_ROWS_EVENT; } }; @@ -546,10 +566,10 @@ namespace MySQLReplication Position position; BinlogEventPtr event; String replicate_do_db; - std::shared_ptr table_map; + std::map > table_maps; size_t checksum_signature_length = 4; - inline bool doReplicate() { return (replicate_do_db.empty() || table_map->schema == replicate_do_db); } + bool doReplicate(UInt64 table_id); }; } diff --git a/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py index 38ff8fd752b..b7f432d963b 100644 --- a/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py @@ -757,3 +757,18 @@ def system_parts_test(clickhouse_node, mysql_node, service_name): check_active_parts(2) clickhouse_node.query("OPTIMIZE TABLE system_parts_test.test") check_active_parts(1) + +def multi_table_update_test(clickhouse_node, mysql_node, service_name): + mysql_node.query("DROP DATABASE IF EXISTS multi_table_update") + clickhouse_node.query("DROP DATABASE IF EXISTS multi_table_update") + mysql_node.query("CREATE DATABASE multi_table_update") + mysql_node.query("CREATE TABLE multi_table_update.a (id INT(11) NOT NULL PRIMARY KEY, value VARCHAR(255))") + mysql_node.query("CREATE TABLE multi_table_update.b (id INT(11) NOT NULL PRIMARY KEY, othervalue VARCHAR(255))") + mysql_node.query("INSERT INTO multi_table_update.a VALUES(1, 'foo')") + mysql_node.query("INSERT INTO multi_table_update.b VALUES(1, 'bar')") + clickhouse_node.query("CREATE DATABASE multi_table_update ENGINE = MaterializeMySQL('{}:3306', 'multi_table_update', 'root', 'clickhouse')".format(service_name)) + check_query(clickhouse_node, "SHOW TABLES FROM multi_table_update", "a\nb\n") + mysql_node.query("UPDATE multi_table_update.a, multi_table_update.b SET value='baz', othervalue='quux' where a.id=b.id") + + 
check_query(clickhouse_node, "SELECT * FROM multi_table_update.a", "1\tbaz\n") + check_query(clickhouse_node, "SELECT * FROM multi_table_update.b", "1\tquux\n") diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index 8cd2f7def07..32316901dce 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ b/tests/integration/test_materialize_mysql_database/test.py @@ -237,3 +237,8 @@ def test_utf8mb4(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhou @pytest.mark.parametrize(('clickhouse_node'), [node_db_ordinary, node_db_ordinary]) def test_system_parts_table(started_cluster, started_mysql_8_0, clickhouse_node): materialize_with_ddl.system_parts_test(clickhouse_node, started_mysql_8_0, "mysql8_0") + +@pytest.mark.parametrize(('clickhouse_node'), [node_db_ordinary, node_db_ordinary]) +def test_multi_table_update(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): + materialize_with_ddl.multi_table_update_test(clickhouse_node, started_mysql_5_7, "mysql1") + materialize_with_ddl.multi_table_update_test(clickhouse_node, started_mysql_8_0, "mysql8_0") From a3ac27674134a743b16e963aa5490e8077cbb297 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 4 Feb 2021 14:56:04 +0300 Subject: [PATCH 049/122] Add type promotion for modulo of division of negative number --- src/DataTypes/NumberTraits.h | 11 ++++++++--- .../01700_mod_negative_type_promotion.reference | 1 + .../0_stateless/01700_mod_negative_type_promotion.sql | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/01700_mod_negative_type_promotion.reference create mode 100644 tests/queries/0_stateless/01700_mod_negative_type_promotion.sql diff --git a/src/DataTypes/NumberTraits.h b/src/DataTypes/NumberTraits.h index 3aa00c68274..479fc37c795 100644 --- a/src/DataTypes/NumberTraits.h +++ b/src/DataTypes/NumberTraits.h @@ -104,11 +104,16 @@ template struct ResultOfIntegerDivision sizeof(A)>::Type; }; -/** Division with remainder you get a number with the same number of bits as in divisor. - */ +/** Division with remainder you get a number with the same number of bits as in divisor, + * or larger in case of signed type. + */ template struct ResultOfModulo { - using Type0 = typename Construct || is_signed_v, false, sizeof(B)>::Type; + static constexpr bool result_is_signed = is_signed_v; + /// If modulo of division can yield negative number, we need larger type to accomodate it. + /// Example: toInt32(-199) % toUInt8(200) will return -199 that does not fit in Int8, only in Int16. + static constexpr size_t size_of_result = result_is_signed ? 
nextSize(sizeof(B)) : sizeof(B); + using Type0 = typename Construct::Type; using Type = std::conditional_t || std::is_floating_point_v, Float64, Type0>; }; diff --git a/tests/queries/0_stateless/01700_mod_negative_type_promotion.reference b/tests/queries/0_stateless/01700_mod_negative_type_promotion.reference new file mode 100644 index 00000000000..b8d2624b7fe --- /dev/null +++ b/tests/queries/0_stateless/01700_mod_negative_type_promotion.reference @@ -0,0 +1 @@ +-199 diff --git a/tests/queries/0_stateless/01700_mod_negative_type_promotion.sql b/tests/queries/0_stateless/01700_mod_negative_type_promotion.sql new file mode 100644 index 00000000000..db850ba5c80 --- /dev/null +++ b/tests/queries/0_stateless/01700_mod_negative_type_promotion.sql @@ -0,0 +1 @@ +SELECT toInt32(-199) % 200; From 858043cd538fb21fbfff69e78b3d3a07b996b100 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 15:06:48 +0300 Subject: [PATCH 050/122] detect unmarked long tests in flaky check --- tests/clickhouse-test | 26 ++++++++++++------- ...ong_concurrent_select_and_drop_deadlock.sh | 2 ++ .../01232_preparing_sets_race_condition.sh | 2 ++ .../0_stateless/01443_merge_truncate.sh | 2 ++ 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 0c49a3670a0..74f5f07eb9d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -428,15 +428,23 @@ def run_tests_array(all_tests_with_params): status += print_test_time(total_time) status += " - result differs with reference:\n{}\n".format(diff) else: - passed_total += 1 - failures_chain = 0 - status += MSG_OK - status += print_test_time(total_time) - status += "\n" - if os.path.exists(stdout_file): - os.remove(stdout_file) - if os.path.exists(stderr_file): - os.remove(stderr_file) + if args.test_runs > 1 and total_time > 30 and 'long' not in name: + # We're in Flaky Check mode, check the run time as well while we're at it. + failures += 1 + failures_chain += 1 + status += MSG_FAIL + status += print_test_time(total_time) + status += " - Long test not marked as 'long'" + else: + passed_total += 1 + failures_chain = 0 + status += MSG_OK + status += print_test_time(total_time) + status += "\n" + if os.path.exists(stdout_file): + os.remove(stdout_file) + if os.path.exists(stderr_file): + os.remove(stderr_file) if status and not status.endswith('\n'): status += '\n' diff --git a/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh b/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh index 60a2d8eb9a0..f7659bc3728 100755 --- a/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh +++ b/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# remove this comment before merge + set -e CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) diff --git a/tests/queries/0_stateless/01232_preparing_sets_race_condition.sh b/tests/queries/0_stateless/01232_preparing_sets_race_condition.sh index e42e68a6589..5aaac7cd86e 100755 --- a/tests/queries/0_stateless/01232_preparing_sets_race_condition.sh +++ b/tests/queries/0_stateless/01232_preparing_sets_race_condition.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# remove this comment before merge + CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh diff --git a/tests/queries/0_stateless/01443_merge_truncate.sh b/tests/queries/0_stateless/01443_merge_truncate.sh index ffd5f225ffe..538e457a5d8 100755 --- a/tests/queries/0_stateless/01443_merge_truncate.sh +++ b/tests/queries/0_stateless/01443_merge_truncate.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# remove this comment before merge + set -e CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 4d66bc413cacd43bca352ea3ce5c912975283ac2 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 4 Feb 2021 16:19:20 +0300 Subject: [PATCH 051/122] Update test. --- tests/queries/0_stateless/00597_push_down_predicate.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/00597_push_down_predicate.sql b/tests/queries/0_stateless/00597_push_down_predicate.sql index ea01bba9f4d..ec306ac6792 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.sql +++ b/tests/queries/0_stateless/00597_push_down_predicate.sql @@ -8,6 +8,8 @@ DROP TABLE IF EXISTS test_view_00597; CREATE TABLE test_00597(date Date, id Int8, name String, value Int64) ENGINE = MergeTree(date, (id, date), 8192); CREATE VIEW test_view_00597 AS SELECT * FROM test_00597; +SELECT * FROM (SELECT floor(floor(1, floor(NULL), id = 257), floor(floor(floor(floor(NULL), '10485.76', '9223372036854775807', NULL), floor(10, floor(65535, NULL), 100.0000991821289), NULL)), '2.56'), b.* FROM (SELECT floor(floor(floor(floor(NULL), 1000.0001220703125))), * FROM test_00597) AS b) WHERE id = 257; + INSERT INTO test_00597 VALUES('2000-01-01', 1, 'test string 1', 1); INSERT INTO test_00597 VALUES('2000-01-01', 2, 'test string 2', 2); From a161969b50b8a167a93d352b8f1b1e9a004155f2 Mon Sep 17 00:00:00 2001 From: Haavard Kvaalen Date: Thu, 4 Feb 2021 14:43:18 +0100 Subject: [PATCH 052/122] Fix incorrectly placed brace --- src/Core/MySQL/MySQLReplication.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index 8e1e0cd7646..8fdf337c849 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -866,7 +866,8 @@ namespace MySQLReplication { if (replicate_do_db.empty()) return false; - if (table_id == 0x00ffffff) { + if (table_id == 0x00ffffff) + { // Special "dummy event" return false; } From ee0ff755e2a8ad105b00fcb305a1a41447af5e87 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 4 Feb 2021 17:46:46 +0300 Subject: [PATCH 053/122] Check that position always advances --- programs/client/Client.cpp | 2 +- src/IO/LimitReadBuffer.cpp | 6 ++++++ src/IO/ReadBuffer.h | 6 +++--- src/Parsers/ParserInsertQuery.h | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 9a8b580407a..f52b9b7a0da 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1719,7 +1719,7 @@ private: } // Remember where the data ended. We use this info later to determine // where the next query begins. 
- parsed_insert_query->end = data_in.buffer().begin() + data_in.count(); + parsed_insert_query->end = parsed_insert_query->data + data_in.count(); } else if (!is_interactive) { diff --git a/src/IO/LimitReadBuffer.cpp b/src/IO/LimitReadBuffer.cpp index f36facfdd99..a2c93642833 100644 --- a/src/IO/LimitReadBuffer.cpp +++ b/src/IO/LimitReadBuffer.cpp @@ -1,4 +1,5 @@ #include + #include @@ -13,6 +14,8 @@ namespace ErrorCodes bool LimitReadBuffer::nextImpl() { + assert(position() >= in.position()); + /// Let underlying buffer calculate read bytes in `next()` call. in.position() = position(); @@ -25,7 +28,10 @@ bool LimitReadBuffer::nextImpl() } if (!in.next()) + { + working_buffer = in.buffer(); return false; + } working_buffer = in.buffer(); diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 3d6eb6970ce..df21fc9bfb2 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -58,9 +58,9 @@ public: bytes += offset(); bool res = nextImpl(); if (!res) - working_buffer.resize(0); - - pos = working_buffer.begin() + nextimpl_working_buffer_offset; + working_buffer = Buffer(pos, pos); + else + pos = working_buffer.begin() + nextimpl_working_buffer_offset; nextimpl_working_buffer_offset = 0; return res; } diff --git a/src/Parsers/ParserInsertQuery.h b/src/Parsers/ParserInsertQuery.h index b6a199c9d71..f98e433551d 100644 --- a/src/Parsers/ParserInsertQuery.h +++ b/src/Parsers/ParserInsertQuery.h @@ -30,7 +30,7 @@ private: const char * getName() const override { return "INSERT query"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; public: - ParserInsertQuery(const char * end_) : end(end_) {} + explicit ParserInsertQuery(const char * end_) : end(end_) {} }; /** Insert accepts an identifier and an asterisk with variants. From d219540a9cef940d7f07f1802f9abc44f71027c0 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 4 Feb 2021 11:13:32 -0400 Subject: [PATCH 054/122] max_array_size = 1mil --- docs/ru/sql-reference/data-types/array.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/ru/sql-reference/data-types/array.md b/docs/ru/sql-reference/data-types/array.md index 906246b66ee..86a23ed041b 100644 --- a/docs/ru/sql-reference/data-types/array.md +++ b/docs/ru/sql-reference/data-types/array.md @@ -47,6 +47,8 @@ SELECT [1, 2] AS x, toTypeName(x) ## Особенности работы с типами данных {#osobennosti-raboty-s-tipami-dannykh} +Максимальный размер массива ограничен одним миллионом элементов. + При создании массива «на лету» ClickHouse автоматически определяет тип аргументов как наиболее узкий тип данных, в котором можно хранить все перечисленные аргументы. Если среди аргументов есть [NULL](../../sql-reference/data-types/array.md#null-literal) или аргумент типа [Nullable](nullable.md#data_type-nullable), то тип элементов массива — [Nullable](nullable.md). Если ClickHouse не смог подобрать тип данных, то он сгенерирует исключение. Это произойдёт, например, при попытке создать массив одновременно со строками и числами `SELECT array(1, 'a')`. 
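The type-inference rule restated in the documentation hunk above (the narrowest data type that can store all listed arguments, promotion to Nullable when a NULL is present, and an exception for mixed strings and numbers) can be illustrated with a few queries. This snippet is an editorial sketch in the style of the test files earlier in this series, not part of any patch; the type names in the comments are the expected results and may differ slightly between ClickHouse versions.

SELECT [1, 2] AS x, toTypeName(x);        -- Array(UInt8): narrowest common type
SELECT [1, 256] AS x, toTypeName(x);      -- Array(UInt16): widened so 256 fits
SELECT [1, 2, NULL] AS x, toTypeName(x);  -- Array(Nullable(UInt8)): NULL makes elements Nullable
SELECT array(1, 'a');                     -- no common type for a number and a string: throws an exception
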
From 0dc12ebfe149d1a7d6c8baccf26600126cdcf427 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Thu, 4 Feb 2021 11:15:33 -0400 Subject: [PATCH 055/122] max_array_size = 1mil --- docs/en/sql-reference/data-types/array.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/sql-reference/data-types/array.md b/docs/en/sql-reference/data-types/array.md index 48957498d63..41e35aaa96f 100644 --- a/docs/en/sql-reference/data-types/array.md +++ b/docs/en/sql-reference/data-types/array.md @@ -45,6 +45,8 @@ SELECT [1, 2] AS x, toTypeName(x) ## Working with Data Types {#working-with-data-types} +The maximum size of an array is limited to one million elements. + When creating an array on the fly, ClickHouse automatically defines the argument type as the narrowest data type that can store all the listed arguments. If there are any [Nullable](../../sql-reference/data-types/nullable.md#data_type-nullable) or literal [NULL](../../sql-reference/syntax.md#null-literal) values, the type of an array element also becomes [Nullable](../../sql-reference/data-types/nullable.md). If ClickHouse couldn’t determine the data type, it generates an exception. For instance, this happens when trying to create an array with strings and numbers simultaneously (`SELECT array(1, 'a')`). From e49a051092260e72973ef57b436b5c773731d633 Mon Sep 17 00:00:00 2001 From: Haavard Kvaalen Date: Thu, 4 Feb 2021 16:21:28 +0100 Subject: [PATCH 056/122] Remove superfluous semicolon --- src/Core/MySQL/MySQLReplication.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/MySQL/MySQLReplication.h b/src/Core/MySQL/MySQLReplication.h index ae8dc80f673..d415bdda70d 100644 --- a/src/Core/MySQL/MySQLReplication.h +++ b/src/Core/MySQL/MySQLReplication.h @@ -442,7 +442,7 @@ namespace MySQLReplication UInt64 table_id; UInt16 flags; - RowsEventHeader(EventType type_) : type(type_), table_id(0), flags(0) {}; + RowsEventHeader(EventType type_) : type(type_), table_id(0), flags(0) {} void parse(ReadBuffer & payload); }; From 237ee39228607a39b13e1b2836ec8b6ca6647bc8 Mon Sep 17 00:00:00 2001 From: "Anthony N. Simon" Date: Thu, 4 Feb 2021 16:34:32 +0100 Subject: [PATCH 057/122] Add Panelbear to adopters --- docs/en/introduction/adopters.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index ca16119f460..c7230f2f080 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -74,6 +74,7 @@ toc_title: Adopters | NOC Project | Network Monitoring | Analytics | Main Product | — | [Official Website](https://getnoc.com/features/big-data/) | | Nuna Inc. 
| Health Data Analytics | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=170) | | OneAPM | Monitorings and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| Panelbear | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 2020](https://panelbear.com/blog/tech-stack/) | | Percent 百分点 | Analytics | Main Product | — | — | [Slides in Chinese, June 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | | Percona | Performance analysis | Percona Monitoring and Management | — | — | [Official website, Mar 2020](https://www.percona.com/blog/2020/03/30/advanced-query-analysis-in-percona-monitoring-and-management-with-direct-clickhouse-access/) | | Plausible | Analytics | Main Product | — | — | [Blog post, June 2020](https://twitter.com/PlausibleHQ/status/1273889629087969280) | From ac4c3a6b27156f0528a1a0d2179b5c7e8d0a66fc Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 4 Feb 2021 18:59:54 +0300 Subject: [PATCH 058/122] Don't pass empty vector to ConcatReadBuffer --- src/Server/HTTPHandler.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index e161b5752ae..5e0d1f0ac66 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -219,8 +219,11 @@ void HTTPHandler::pushDelayedResults(Output & used_output) } } - ConcatReadBuffer concat_read_buffer(read_buffers_raw_ptr); - copyData(concat_read_buffer, *used_output.out_maybe_compressed); + if (!read_buffers_raw_ptr.empty()) + { + ConcatReadBuffer concat_read_buffer(read_buffers_raw_ptr); + copyData(concat_read_buffer, *used_output.out_maybe_compressed); + } } From 85b5805c3cc0c20286030cd505356c9b28c25047 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 4 Feb 2021 19:36:31 +0300 Subject: [PATCH 059/122] Fix tests --- src/DataTypes/NumberTraits.h | 2 +- src/Functions/tests/gtest_number_traits.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DataTypes/NumberTraits.h b/src/DataTypes/NumberTraits.h index 479fc37c795..c3b0d956ef5 100644 --- a/src/DataTypes/NumberTraits.h +++ b/src/DataTypes/NumberTraits.h @@ -110,7 +110,7 @@ template struct ResultOfIntegerDivision template struct ResultOfModulo { static constexpr bool result_is_signed = is_signed_v; - /// If modulo of division can yield negative number, we need larger type to accomodate it. + /// If modulo of division can yield negative number, we need larger type to accommodate it. /// Example: toInt32(-199) % toUInt8(200) will return -199 that does not fit in Int8, only in Int16. static constexpr size_t size_of_result = result_is_signed ? 
nextSize(sizeof(B)) : sizeof(B); using Type0 = typename Construct::Type; diff --git a/src/Functions/tests/gtest_number_traits.cpp b/src/Functions/tests/gtest_number_traits.cpp index 7664b4fcbdc..7f25c6cbeb7 100644 --- a/src/Functions/tests/gtest_number_traits.cpp +++ b/src/Functions/tests/gtest_number_traits.cpp @@ -258,7 +258,7 @@ TEST(NumberTraits, Others) ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfFloatingPointDivision::Type()), "Float64"); ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfFloatingPointDivision::Type()), "Float64"); ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfIntegerDivision::Type()), "Int8"); - ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfModulo::Type()), "Int8"); + ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfModulo::Type()), "UInt8"); } From b0fba3decef216e32c8813c0fe321210915159c6 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 4 Feb 2021 20:46:15 +0300 Subject: [PATCH 060/122] mark as long --- ...ference => 01232_preparing_sets_race_condition_long.reference} | 0 ...e_condition.sh => 01232_preparing_sets_race_condition_long.sh} | 0 ...rge_truncate.reference => 01443_merge_truncate_long.reference} | 0 .../{01443_merge_truncate.sh => 01443_merge_truncate_long.sh} | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{01232_preparing_sets_race_condition.reference => 01232_preparing_sets_race_condition_long.reference} (100%) rename tests/queries/0_stateless/{01232_preparing_sets_race_condition.sh => 01232_preparing_sets_race_condition_long.sh} (100%) rename tests/queries/0_stateless/{01443_merge_truncate.reference => 01443_merge_truncate_long.reference} (100%) rename tests/queries/0_stateless/{01443_merge_truncate.sh => 01443_merge_truncate_long.sh} (100%) diff --git a/tests/queries/0_stateless/01232_preparing_sets_race_condition.reference b/tests/queries/0_stateless/01232_preparing_sets_race_condition_long.reference similarity index 100% rename from tests/queries/0_stateless/01232_preparing_sets_race_condition.reference rename to tests/queries/0_stateless/01232_preparing_sets_race_condition_long.reference diff --git a/tests/queries/0_stateless/01232_preparing_sets_race_condition.sh b/tests/queries/0_stateless/01232_preparing_sets_race_condition_long.sh similarity index 100% rename from tests/queries/0_stateless/01232_preparing_sets_race_condition.sh rename to tests/queries/0_stateless/01232_preparing_sets_race_condition_long.sh diff --git a/tests/queries/0_stateless/01443_merge_truncate.reference b/tests/queries/0_stateless/01443_merge_truncate_long.reference similarity index 100% rename from tests/queries/0_stateless/01443_merge_truncate.reference rename to tests/queries/0_stateless/01443_merge_truncate_long.reference diff --git a/tests/queries/0_stateless/01443_merge_truncate.sh b/tests/queries/0_stateless/01443_merge_truncate_long.sh similarity index 100% rename from tests/queries/0_stateless/01443_merge_truncate.sh rename to tests/queries/0_stateless/01443_merge_truncate_long.sh From a500bd70a489fbce3a4968fb4ec3b31db5e5cab0 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Thu, 4 Feb 2021 21:14:02 +0300 Subject: [PATCH 061/122] Update index.md --- docs/en/sql-reference/window-functions/index.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index a79328ade32..22b40585452 100644 --- 
a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -1,4 +1,9 @@ -# [development] Window Functions +--- +toc_priority: 62 +toc_title: Window Functions +--- + +# [experimental] Window Functions !!! warning "Warning" This is an experimental feature that is currently in development and is not ready @@ -11,9 +16,7 @@ Pure window functions such as `rank`, `lag`, `lead` and so on are not yet suppor The window can be specified either with an `OVER` clause or with a separate `WINDOW` clause. -Only two variants of frame are supported, `ROWS` and `RANGE`. The only supported -frame boundaries are `ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW`. - +Only two variants of frame are supported, `ROWS` and `RANGE`. Offsets for the `RANGE` frame are not yet supported. ## References From 61605026658e24c218c8bb3b9a531883a7d27bda Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Thu, 4 Feb 2021 21:16:26 +0300 Subject: [PATCH 062/122] Update index.md --- docs/en/sql-reference/window-functions/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 22b40585452..211656ed07f 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -31,6 +31,7 @@ https://github.com/ClickHouse/ClickHouse/blob/master/tests/performance/window_fu https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/01591_window_functions.sql ### Postgres Docs +https://www.postgresql.org/docs/current/sql-select.html#SQL-WINDOW https://www.postgresql.org/docs/devel/sql-expressions.html#SYNTAX-WINDOW-FUNCTIONS https://www.postgresql.org/docs/devel/functions-window.html https://www.postgresql.org/docs/devel/tutorial-window.html From cea2fcb18ca578b4ac1ac8bd8c57633da92ee501 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Thu, 4 Feb 2021 21:17:25 +0300 Subject: [PATCH 063/122] Update index.md --- docs/en/sql-reference/window-functions/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 211656ed07f..5a6f13226a5 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -8,7 +8,7 @@ toc_title: Window Functions !!! warning "Warning" This is an experimental feature that is currently in development and is not ready for general use. It will change in unpredictable backwards-incompatible ways in -the future releases. +the future releases. Set `allow_experimental_window_functions = 1` to enable it. ClickHouse currently supports calculation of aggregate functions over a window. Pure window functions such as `rank`, `lag`, `lead` and so on are not yet supported. 
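The page patched above says which window features exist but stops short of showing a query; the sketch below exercises only what it names — an aggregate function over a window, a `ROWS` frame, and the experimental setting. The table `t` with columns `id` and `v` is an assumed placeholder, not anything defined in these patches:

``` sql
SET allow_experimental_window_functions = 1;

SELECT
    id,
    v,
    sum(v) OVER (PARTITION BY id % 2 ORDER BY id
                 ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_sum
FROM t;
```

The same window could also be declared once in a separate `WINDOW` clause and referenced by name, as the page mentions.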
From 610d7b755b9eb80abcd2696dd130ec57f14d34e5 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 4 Feb 2021 21:50:15 +0300 Subject: [PATCH 064/122] Don't forget to update position of inner buffer --- src/IO/ReadWriteBufferFromHTTP.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index de10f268dc3..9c77fc3a517 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -205,6 +205,8 @@ namespace detail { if (next_callback) next_callback(count()); + if (working_buffer.size()) + impl->position() = position(); if (!impl->next()) return false; internal_buffer = impl->buffer(); From 19c38c61e6baad8d81ec52837d8ff18f5cdfc8b5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 4 Feb 2021 22:03:56 +0300 Subject: [PATCH 065/122] Add fuzzer for ColumnsDescription --- src/Storages/tests/CMakeLists.txt | 3 +++ src/Storages/tests/columns_description_fuzzer.cpp | 14 ++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 src/Storages/tests/columns_description_fuzzer.cpp diff --git a/src/Storages/tests/CMakeLists.txt b/src/Storages/tests/CMakeLists.txt index 292f7603838..b58fed9edf5 100644 --- a/src/Storages/tests/CMakeLists.txt +++ b/src/Storages/tests/CMakeLists.txt @@ -29,4 +29,7 @@ target_link_libraries (transform_part_zk_nodes if (ENABLE_FUZZING) add_executable (mergetree_checksum_fuzzer mergetree_checksum_fuzzer.cpp) target_link_libraries (mergetree_checksum_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) + + add_executable (columns_description_fuzzer columns_description_fuzzer.cpp) + target_link_libraries (columns_description_fuzzer PRIVATE dbms ${LIB_FUZZING_ENGINE}) endif () diff --git a/src/Storages/tests/columns_description_fuzzer.cpp b/src/Storages/tests/columns_description_fuzzer.cpp new file mode 100644 index 00000000000..b0f1c0a14f9 --- /dev/null +++ b/src/Storages/tests/columns_description_fuzzer.cpp @@ -0,0 +1,14 @@ +#include + + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) +try +{ + using namespace DB; + ColumnsDescription columns = ColumnsDescription::parse(std::string(reinterpret_cast(data), size)); + return 0; +} +catch (...) +{ + return 1; +} From 84b88c68025e1d3237b1b8f850255a6023487505 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Thu, 4 Feb 2021 22:06:43 +0300 Subject: [PATCH 066/122] CC --- src/Disks/S3/DiskS3.cpp | 4 ++-- src/IO/ReadWriteBufferFromHTTP.h | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 238db98c9cc..89413660e35 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -244,7 +244,7 @@ public: if (whence == SEEK_CUR) { /// If position within current working buffer - shift pos. - if (working_buffer.size() && size_t(getPosition() + offset_) < absolute_position) + if (!working_buffer.empty() && size_t(getPosition() + offset_) < absolute_position) { pos += offset_; return getPosition(); @@ -257,7 +257,7 @@ public: else if (whence == SEEK_SET) { /// If position within current working buffer - shift pos. 
- if (working_buffer.size() && size_t(offset_) >= absolute_position - working_buffer.size() + if (!working_buffer.empty() && size_t(offset_) >= absolute_position - working_buffer.size() && size_t(offset_) < absolute_position) { pos = working_buffer.end() - (absolute_position - offset_); diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index 9c77fc3a517..9cd37bd00f8 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -76,9 +76,7 @@ public: } } - virtual ~UpdatableSessionBase() - { - } + virtual ~UpdatableSessionBase() = default; }; @@ -205,7 +203,7 @@ namespace detail { if (next_callback) next_callback(count()); - if (working_buffer.size()) + if (!working_buffer.empty()) impl->position() = position(); if (!impl->next()) return false; From d2c1a97d8618bb94b28cabb7abaee0de187e2b4f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 4 Feb 2021 22:41:12 +0300 Subject: [PATCH 067/122] Minor modification --- src/Storages/tests/columns_description_fuzzer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/tests/columns_description_fuzzer.cpp b/src/Storages/tests/columns_description_fuzzer.cpp index b0f1c0a14f9..44fd667ff1c 100644 --- a/src/Storages/tests/columns_description_fuzzer.cpp +++ b/src/Storages/tests/columns_description_fuzzer.cpp @@ -6,6 +6,7 @@ try { using namespace DB; ColumnsDescription columns = ColumnsDescription::parse(std::string(reinterpret_cast(data), size)); + std::cerr << columns.toString() << "\n"; return 0; } catch (...) From 0db2aa3c2e8aa39eb814b8c8a22e904cb8528025 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Fri, 5 Feb 2021 01:21:24 +0300 Subject: [PATCH 068/122] Check that position doesn't go beyond end (#20039) --- src/IO/ReadBuffer.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 3d6eb6970ce..68ebf154597 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -55,6 +55,8 @@ public: */ bool next() { + assert(position() <= working_buffer.end()); + bytes += offset(); bool res = nextImpl(); if (!res) @@ -62,6 +64,9 @@ public: pos = working_buffer.begin() + nextimpl_working_buffer_offset; nextimpl_working_buffer_offset = 0; + + assert(position() <= working_buffer.end()); + return res; } From d59b45e4961be70a794ee790f340034589f72683 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 5 Feb 2021 02:14:17 +0300 Subject: [PATCH 069/122] Fix build --- src/IO/BrotliReadBuffer.cpp | 2 +- src/IO/HashingReadBuffer.h | 11 ++++++----- src/IO/LZMAInflatingReadBuffer.cpp | 2 +- src/IO/LimitReadBuffer.cpp | 2 +- src/IO/WriteBuffer.h | 4 ++-- src/IO/ZlibInflatingReadBuffer.cpp | 2 +- src/IO/ZstdInflatingReadBuffer.cpp | 2 +- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/IO/BrotliReadBuffer.cpp b/src/IO/BrotliReadBuffer.cpp index 70d3a76e629..41991ad0516 100644 --- a/src/IO/BrotliReadBuffer.cpp +++ b/src/IO/BrotliReadBuffer.cpp @@ -77,7 +77,7 @@ bool BrotliReadBuffer::nextImpl() if (in->eof()) { eof = true; - return working_buffer.size() != 0; + return !working_buffer.empty(); } else { diff --git a/src/IO/HashingReadBuffer.h b/src/IO/HashingReadBuffer.h index 9fcd6dc6b41..08b6de69dcb 100644 --- a/src/IO/HashingReadBuffer.h +++ b/src/IO/HashingReadBuffer.h @@ -1,10 +1,11 @@ #pragma once -#include #include +#include namespace DB { + /* * Calculates the hash from the read data. When reading, the data is read from the nested ReadBuffer. 
* Small pieces are copied into its own memory. @@ -12,14 +13,14 @@ namespace DB class HashingReadBuffer : public IHashingBuffer { public: - HashingReadBuffer(ReadBuffer & in_, size_t block_size_ = DBMS_DEFAULT_HASHING_BLOCK_SIZE) : - IHashingBuffer(block_size_), in(in_) + explicit HashingReadBuffer(ReadBuffer & in_, size_t block_size_ = DBMS_DEFAULT_HASHING_BLOCK_SIZE) + : IHashingBuffer(block_size_), in(in_) { working_buffer = in.buffer(); pos = in.position(); /// calculate hash from the data already read - if (working_buffer.size()) + if (!working_buffer.empty()) { calculateHash(pos, working_buffer.end() - pos); } @@ -39,7 +40,7 @@ private: return res; } -private: ReadBuffer & in; }; + } diff --git a/src/IO/LZMAInflatingReadBuffer.cpp b/src/IO/LZMAInflatingReadBuffer.cpp index e30e8df5f9d..6a0a7e5ee31 100644 --- a/src/IO/LZMAInflatingReadBuffer.cpp +++ b/src/IO/LZMAInflatingReadBuffer.cpp @@ -66,7 +66,7 @@ bool LZMAInflatingReadBuffer::nextImpl() if (in->eof()) { eof = true; - return working_buffer.size() != 0; + return !working_buffer.empty(); } else { diff --git a/src/IO/LimitReadBuffer.cpp b/src/IO/LimitReadBuffer.cpp index f36facfdd99..b0d734c9ca0 100644 --- a/src/IO/LimitReadBuffer.cpp +++ b/src/IO/LimitReadBuffer.cpp @@ -50,7 +50,7 @@ LimitReadBuffer::LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exc LimitReadBuffer::~LimitReadBuffer() { /// Update underlying buffer's position in case when limit wasn't reached. - if (working_buffer.size() != 0) + if (!working_buffer.empty()) in.position() = position(); } diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index 6abcc1c8eed..d425f813d7b 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -61,7 +61,7 @@ public: /** it is desirable in the derived classes to place the next() call in the destructor, * so that the last data is written */ - virtual ~WriteBuffer() {} + virtual ~WriteBuffer() = default; inline void nextIfAtEnd() { @@ -75,7 +75,7 @@ public: size_t bytes_copied = 0; /// Produces endless loop - assert(working_buffer.size() > 0); + assert(!working_buffer.empty()); while (bytes_copied < n) { diff --git a/src/IO/ZlibInflatingReadBuffer.cpp b/src/IO/ZlibInflatingReadBuffer.cpp index 0b23bef1b10..bea83c74e21 100644 --- a/src/IO/ZlibInflatingReadBuffer.cpp +++ b/src/IO/ZlibInflatingReadBuffer.cpp @@ -70,7 +70,7 @@ bool ZlibInflatingReadBuffer::nextImpl() if (in->eof()) { eof = true; - return working_buffer.size() != 0; + return !working_buffer.empty(); } else { diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index 94a0b56fc6d..b441a6a7210 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -54,7 +54,7 @@ bool ZstdInflatingReadBuffer::nextImpl() if (in->eof()) { eof = true; - return working_buffer.size() != 0; + return !working_buffer.empty(); } return true; From b892fff406b2a69ed372acad528505f6f4314544 Mon Sep 17 00:00:00 2001 From: hexiaoting Date: Fri, 5 Feb 2021 10:31:16 +0800 Subject: [PATCH 070/122] Add alexey's fix: using another logic and more comments --- src/DataTypes/NumberTraits.h | 12 +++++++----- src/Functions/tests/gtest_number_traits.cpp | 2 +- .../0_stateless/01692_mod_enlarge_type.reference | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/DataTypes/NumberTraits.h b/src/DataTypes/NumberTraits.h index 4d534df0b42..14bd32c87a3 100644 --- a/src/DataTypes/NumberTraits.h +++ b/src/DataTypes/NumberTraits.h @@ -104,14 +104,16 @@ template struct ResultOfIntegerDivision sizeof(A)>::Type; }; -/** 
Division with remainder you get a number with the same number of bits as in divisor. +/** Division with remainder you get a number with the same number of bits as in divisor, + * or larger in case of signed type. */ template struct ResultOfModulo { - using Type0 = typename Construct< - is_signed_v || is_signed_v, - false, - (is_signed_v || is_signed_v) ? std::max(sizeof(A), sizeof(B)) : sizeof(B)>::Type; + static constexpr bool result_is_signed = is_signed_v; + /// If modulo of division can yield negative number, we need larger type to accommodate it. + /// Example: toInt32(-199) % toUInt8(200) will return -199 that does not fit in Int8, only in Int16. + static constexpr size_t size_of_result = result_is_signed ? nextSize(sizeof(B)) : sizeof(B); + using Type0 = typename Construct::Type; using Type = std::conditional_t || std::is_floating_point_v, Float64, Type0>; }; diff --git a/src/Functions/tests/gtest_number_traits.cpp b/src/Functions/tests/gtest_number_traits.cpp index 7664b4fcbdc..7f25c6cbeb7 100644 --- a/src/Functions/tests/gtest_number_traits.cpp +++ b/src/Functions/tests/gtest_number_traits.cpp @@ -258,7 +258,7 @@ TEST(NumberTraits, Others) ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfFloatingPointDivision::Type()), "Float64"); ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfFloatingPointDivision::Type()), "Float64"); ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfIntegerDivision::Type()), "Int8"); - ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfModulo::Type()), "Int8"); + ASSERT_EQ(getTypeString(DB::NumberTraits::ResultOfModulo::Type()), "UInt8"); } diff --git a/tests/queries/0_stateless/01692_mod_enlarge_type.reference b/tests/queries/0_stateless/01692_mod_enlarge_type.reference index 6d962821ad6..fe7df569ea2 100644 --- a/tests/queries/0_stateless/01692_mod_enlarge_type.reference +++ b/tests/queries/0_stateless/01692_mod_enlarge_type.reference @@ -1,2 +1,2 @@ -199 -146 Int32 +146 Int16 From 52b52ede226282fe609c766566b806206c5985cb Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Fri, 5 Feb 2021 10:06:17 +0300 Subject: [PATCH 071/122] Update 00840_long_concurrent_select_and_drop_deadlock.sh --- .../00840_long_concurrent_select_and_drop_deadlock.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh b/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh index f7659bc3728..60a2d8eb9a0 100755 --- a/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh +++ b/tests/queries/0_stateless/00840_long_concurrent_select_and_drop_deadlock.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -# remove this comment before merge - set -e CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From a1bcd4b128622ec2db79ea7564c60664784b52cc Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Fri, 5 Feb 2021 10:06:39 +0300 Subject: [PATCH 072/122] Update 01232_preparing_sets_race_condition_long.sh --- .../0_stateless/01232_preparing_sets_race_condition_long.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/01232_preparing_sets_race_condition_long.sh b/tests/queries/0_stateless/01232_preparing_sets_race_condition_long.sh index 5aaac7cd86e..e42e68a6589 100755 --- a/tests/queries/0_stateless/01232_preparing_sets_race_condition_long.sh +++ b/tests/queries/0_stateless/01232_preparing_sets_race_condition_long.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -# remove this 
comment before merge - CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh From cb791dbc7fc3b71290a7f86c6e5494cbc14bd977 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Fri, 5 Feb 2021 10:06:53 +0300 Subject: [PATCH 073/122] Update 01443_merge_truncate_long.sh --- tests/queries/0_stateless/01443_merge_truncate_long.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/01443_merge_truncate_long.sh b/tests/queries/0_stateless/01443_merge_truncate_long.sh index 538e457a5d8..ffd5f225ffe 100755 --- a/tests/queries/0_stateless/01443_merge_truncate_long.sh +++ b/tests/queries/0_stateless/01443_merge_truncate_long.sh @@ -1,7 +1,5 @@ #!/usr/bin/env bash -# remove this comment before merge - set -e CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 3becb80c13f21f54f29229f596cfa6e93e063d23 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Thu, 4 Feb 2021 23:43:03 -0800 Subject: [PATCH 074/122] Docs - date_add, date_diff --- .../functions/date-time-functions.md | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 9de780fb596..967f489f2ab 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -435,6 +435,87 @@ Result: - [toStartOfInterval](#tostartofintervaltime-or-data-interval-x-unit-time-zone) +## date\_add {#date_add} + +Adds specified date/time interval to the provided date. + +**Syntax** + +``` sql +date_add(unit, value, date) +``` + +Aliases: `dateAdd`, `DATE_ADD`. + +**Parameters** + +- `unit` - The unit of time - [String](../syntax.md#syntax-string-literal). + Possible values: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` +- `value` - Amount of the specified unit of time. +- `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + + +**Example** + +```sql +select DATE_ADD(YEAR, 3, toDate('2018-01-01')); +``` + +```text +┌─plus(toDate('2018-01-01'), toIntervalYear(3))─┐ +│ 2021-01-01 │ +└───────────────────────────────────────────────┘ +``` + +## date\_diff {#date_diff} + +Returns the difference between two dates in terms of the specified unit. + +**Syntax** + +``` sql +date_sub(unit, date1, date2) +``` + +Aliases: `date_diff`, `DATE_DIFF`. + +**Parameters** + +- `unit` - The unit of time - [String](../syntax.md#syntax-string-literal). + Possible values: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +- `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + +**Example** + +```sql +select DATE_DIFF(MONTH, toDate('2018-12-18'), toDate('2018-01-01')); +``` + +```text +┌─dateDiff('month', toDate('2018-12-18'), toDate('2018-01-01'))─┐ +│ -11 │ +└───────────────────────────────────────────────────────────────┘ +``` + ## now {#now} Returns the current date and time. 
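To complement the single-function examples added above, a combined sketch of the two new functions in one query; the values in the comments follow from ordinary calendar arithmetic rather than from anything in this patch:

``` sql
SELECT
    date_add(YEAR, 1, toDate('2018-01-01'))                     AS plus_one_year,  -- 2019-01-01
    date_diff(YEAR, toDate('2018-01-01'), toDate('2021-01-01')) AS years_between;  -- 3
```

As in the examples above, the unit is written as a bare keyword (`YEAR`, `MONTH`, …) rather than a quoted string.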
From 606c914bb5b8dd602da08f75580d22823f8b81c0 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Fri, 5 Feb 2021 00:03:45 -0800 Subject: [PATCH 075/122] Docs - timestamp_add|sub --- .../functions/date-time-functions.md | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 967f489f2ab..d0288d25074 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -515,6 +515,86 @@ select DATE_DIFF(MONTH, toDate('2018-12-18'), toDate('2018-01-01')); │ -11 │ └───────────────────────────────────────────────────────────────┘ ``` + +## timestamp\_add {#timestamp_add} + +Adds the specified time value with the provided date or date time value. + +**Syntax** + +``` sql +timestamp_add(date, INTERVAL value unit) +``` + +Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. + +**Parameters** + +- `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `value` - Amount of the specified unit of time - [String](../syntax.md#syntax-string-literal) +- `unit` - The unit of time interval - [String](../syntax.md#syntax-string-literal). + Possible values: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` + +**Example** + +```sql +select timestamp_add(toDate('2018-01-01'), INTERVAL 3 MONTH); +``` + +```text +┌─plus(toDate('2018-01-01'), toIntervalMonth(3))─┐ +│ 2018-04-01 │ +└────────────────────────────────────────────────┘ +``` + +## timestamp\_sub {#timestamp_sub} + +Returns the difference between two dates in terms of the specified unit. + +**Syntax** + +``` sql +timestamp_sub(unit, value, date) +``` + +Aliases: `timeStampSub`, `TIMESTAMP_SUB`. + +**Parameters** + +- `unit` - The unit of time - [String](../syntax.md#syntax-string-literal). + Possible values: + + - `second` + - `minute` + - `hour` + - `day` + - `week` + - `month` + - `quarter` + - `year` +- value - Amount of the specified unit of time. [String](../syntax.md#syntax-string-literal). +- `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + +**Example** + +```sql +select timestamp_sub(MONTH, 5, toDateTime('2018-12-18 01:02:03')); +``` + +```text +┌─minus(toDateTime('2018-12-18 01:02:03'), toIntervalMonth(5))─┐ +│ 2018-07-18 01:02:03 │ +└──────────────────────────────────────────────────────────────┘ +``` ## now {#now} From 8c60e84067e1bac3f746c4c3b9b9faaaf9235bc8 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Fri, 5 Feb 2021 00:04:56 -0800 Subject: [PATCH 076/122] Docs - minor fixes --- .../sql-reference/functions/date-time-functions.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index d0288d25074..fa4ea7a739e 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -467,7 +467,7 @@ Aliases: `dateAdd`, `DATE_ADD`. **Example** ```sql -select DATE_ADD(YEAR, 3, toDate('2018-01-01')); +select date_add(YEAR, 3, toDate('2018-01-01')); ``` ```text @@ -501,13 +501,12 @@ Aliases: `date_diff`, `DATE_DIFF`. 
- `month` - `quarter` - `year` - -- `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `date1`,`date2` - Dates or Dates with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Example** ```sql -select DATE_DIFF(MONTH, toDate('2018-12-18'), toDate('2018-01-01')); +select date_diff(MONTH, toDate('2018-12-18'), toDate('2018-01-01')); ``` ```text @@ -558,7 +557,7 @@ select timestamp_add(toDate('2018-01-01'), INTERVAL 3 MONTH); ## timestamp\_sub {#timestamp_sub} -Returns the difference between two dates in terms of the specified unit. +Returns the difference between two dates in the specified unit. **Syntax** @@ -581,8 +580,8 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. - `month` - `quarter` - `year` -- value - Amount of the specified unit of time. [String](../syntax.md#syntax-string-literal). -- `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `value` - Amount of the specified unit of time. [String](../syntax.md#syntax-string-literal). +- `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). **Example** From 7cbc135e726547a5e42f6fe16ef01197f9dfd440 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 12:54:34 +0300 Subject: [PATCH 077/122] More isolated code --- src/DataStreams/RemoteQueryExecutor.cpp | 78 +++---- src/DataStreams/RemoteQueryExecutor.h | 9 +- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 219 +++++++++++------- .../MergeTree/MergeTreeDataSelectExecutor.h | 19 ++ 4 files changed, 189 insertions(+), 136 deletions(-) diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 27b3de66497..fc3870b3f22 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -173,9 +173,7 @@ void RemoteQueryExecutor::sendQuery() { std::lock_guard lock(duplicated_part_uuids_mutex); if (!duplicated_part_uuids.empty()) - { multiplexed_connections->sendIgnoredPartUUIDs(duplicated_part_uuids); - } } multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true); @@ -206,29 +204,9 @@ Block RemoteQueryExecutor::read() Packet packet = multiplexed_connections->receivePacket(); if (auto block = processPacket(std::move(packet))) - { - if (got_duplicated_part_uuids) - { - /// Cancel previous query and disconnect before retry. - cancel(); - multiplexed_connections->disconnect(); - - /// Only resend once, otherwise throw an exception - if (!resent_query) - { - if (log) - LOG_DEBUG(log, "Found duplicate UUIDs, will retry query without those parts"); - - resent_query = true; - sent_query = false; - got_duplicated_part_uuids = false; - /// Consecutive read will implicitly send query first. - return read(); - } - throw Exception("Found duplicate uuids while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); - } return *block; - } + else if (got_duplicated_part_uuids) + return std::get(restartQueryWithoutDuplicatedUUIDs()); } } @@ -266,29 +244,9 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr else { if (auto data = processPacket(std::move(read_context->packet))) - { - if (got_duplicated_part_uuids) - { - /// Cancel previous query and disconnect before retry. 
- cancel(&read_context); - multiplexed_connections->disconnect(); - - /// Only resend once, otherwise throw an exception - if (!resent_query) - { - if (log) - LOG_DEBUG(log, "Found duplicate UUIDs, will retry query without those parts"); - - resent_query = true; - sent_query = false; - got_duplicated_part_uuids = false; - /// Consecutive read will implicitly send query first. - return read(read_context); - } - throw Exception("Found duplicate uuids while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); - } return std::move(*data); - } + else if (got_duplicated_part_uuids) + return restartQueryWithoutDuplicatedUUIDs(&read_context); } } while (true); @@ -297,16 +255,38 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr #endif } + +std::variant RemoteQueryExecutor::restartQueryWithoutDuplicatedUUIDs(std::unique_ptr * read_context) +{ + /// Cancel previous query and disconnect before retry. + cancel(read_context); + multiplexed_connections->disconnect(); + + /// Only resend once, otherwise throw an exception + if (!resent_query) + { + if (log) + LOG_DEBUG(log, "Found duplicate UUIDs, will retry query without those parts"); + + resent_query = true; + sent_query = false; + got_duplicated_part_uuids = false; + /// Consecutive read will implicitly send query first. + if (!read_context) + return read(); + else + return read(*read_context); + } + throw Exception("Found duplicate uuids while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); +} + std::optional RemoteQueryExecutor::processPacket(Packet packet) { switch (packet.type) { case Protocol::Server::PartUUIDs: if (!setPartUUIDs(packet.part_uuids)) - { got_duplicated_part_uuids = true; - return Block(); - } break; case Protocol::Server::Data: /// If the block is not empty and is not a header block diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index 843cf75f1f8..6a10627b948 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -174,10 +174,14 @@ private: /// Send all temporary tables to remote servers void sendExternalTables(); - /** Set part uuids to a query context, collected from remote replicas. - */ + /// Set part uuids to a query context, collected from remote replicas. + /// Return true if duplicates found. bool setPartUUIDs(const std::vector & uuids); + /// Cancell query and restart it with info about duplicated UUIDs + /// only for `allow_experimental_query_deduplication`. + std::variant restartQueryWithoutDuplicatedUUIDs(std::unique_ptr * read_context = nullptr); + /// If wasn't sent yet, send request to cancel all connections to replicas void tryCancel(const char * reason, std::unique_ptr * read_context); @@ -192,6 +196,7 @@ private: /// Reads packet by packet Block readPackets(); + }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 740288e3b46..6b6098321cf 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -267,92 +266,13 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( } } - /// Select the parts in which there can be data that satisfy `minmax_idx_condition` and that match the condition on `_part`, - /// as well as `max_block_number_to_read`. - /// Skip parts uuids if any to the query context, or skip parts which uuids marked as excluded. 
- { - Context & query_context - = context.hasQueryContext() ? const_cast(context).getQueryContext() : const_cast(context); + const Context & query_context = context.hasQueryContext() ? context.getQueryContext() : context; - /// process_parts prepare parts that have to be read for the query, - /// returns false if duplicated parts' UUID have been met - auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool - { - auto ignored_part_uuids = query_context.getIgnoredPartUUIDs(); - std::unordered_set temp_part_uuids; + if (query_context.getSettingsRef().allow_experimental_query_deduplication) + selectPartsToReadWithUUIDFilter(parts, part_values, minmax_idx_condition, partition_pruner, max_block_numbers_to_read, query_context); + else + selectPartsToRead(parts, part_values, minmax_idx_condition, partition_pruner, max_block_numbers_to_read); - auto prev_parts = selected_parts; - selected_parts.clear(); - - for (const auto & part : prev_parts) - { - if (part_values.find(part->name) == part_values.end()) - continue; - - if (part->isEmpty()) - continue; - - if (minmax_idx_condition - && !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, data.minmax_idx_column_types) - .can_be_true) - continue; - - if (partition_pruner) - { - if (partition_pruner->canBePruned(part)) - continue; - } - - if (max_block_numbers_to_read) - { - auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); - if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) - continue; - } - - /// populate UUIDs and exclude ignored parts if enabled - if (query_context.getSettingsRef().allow_experimental_query_deduplication && part->uuid != UUIDHelpers::Nil) - { - /// Skip the part if its uuid is meant to be excluded - if (ignored_part_uuids->has(part->uuid)) - continue; - - auto result = temp_part_uuids.insert(part->uuid); - if (!result.second) - throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR); - } - - selected_parts.push_back(part); - } - - if (!temp_part_uuids.empty()) - { - auto duplicates = query_context.getPartUUIDs()->add(std::vector{temp_part_uuids.begin(), temp_part_uuids.end()}); - if (!duplicates.empty()) - { - /// on a local replica with prefer_localhost_replica=1 if any duplicates appeared during the first pass, - /// adding them to the exclusion, so they will be skipped on second pass - query_context.getIgnoredPartUUIDs()->add(duplicates); - return false; - } - } - - return true; - }; - - /// Process parts that have to be read for a query. - auto needs_retry = !select_parts(parts); - /// If any duplicated part UUIDs met during the first step, try to ignore them in second pass - if (needs_retry) - { - if (log) - LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them"); - - /// Second attempt didn't help, throw an exception - if (!select_parts(parts)) - throw Exception("Found duplicate UUIDs while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); - } - } /// Sampling. 
Names column_names_to_read = real_column_names; @@ -1924,5 +1844,134 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( return res; } +void MergeTreeDataSelectExecutor::selectPartsToRead( + MergeTreeData::DataPartsVector & parts, + const std::unordered_set & part_values, + const std::optional & minmax_idx_condition, + std::optional & partition_pruner, + const PartitionIdToMaxBlock * max_block_numbers_to_read) const +{ + auto prev_parts = parts; + parts.clear(); + + for (const auto & part : prev_parts) + { + if (part_values.find(part->name) == part_values.end()) + continue; + + if (part->isEmpty()) + continue; + + if (minmax_idx_condition && !minmax_idx_condition->checkInHyperrectangle( + part->minmax_idx.hyperrectangle, data.minmax_idx_column_types).can_be_true) + continue; + + if (partition_pruner) + { + if (partition_pruner->canBePruned(part)) + continue; + } + + if (max_block_numbers_to_read) + { + auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); + if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) + continue; + } + + parts.push_back(part); + } +} + +void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( + MergeTreeData::DataPartsVector & parts, + const std::unordered_set & part_values, + const std::optional & minmax_idx_condition, + std::optional & partition_pruner, + const PartitionIdToMaxBlock * max_block_numbers_to_read, + const Context & query_context) const +{ + /// const_cast to add UUIDs to context. Bad practice. + Context & non_const_context = const_cast(query_context); + + /// process_parts prepare parts that have to be read for the query, + /// returns false if duplicated parts' UUID have been met + auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool + { + auto ignored_part_uuids = non_const_context.getIgnoredPartUUIDs(); + std::unordered_set temp_part_uuids; + + auto prev_parts = selected_parts; + selected_parts.clear(); + + for (const auto & part : prev_parts) + { + if (part_values.find(part->name) == part_values.end()) + continue; + + if (part->isEmpty()) + continue; + + if (minmax_idx_condition + && !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, data.minmax_idx_column_types) + .can_be_true) + continue; + + if (partition_pruner) + { + if (partition_pruner->canBePruned(part)) + continue; + } + + if (max_block_numbers_to_read) + { + auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); + if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) + continue; + } + + /// populate UUIDs and exclude ignored parts if enabled + if (part->uuid != UUIDHelpers::Nil) + { + /// Skip the part if its uuid is meant to be excluded + if (ignored_part_uuids->has(part->uuid)) + continue; + + auto result = temp_part_uuids.insert(part->uuid); + if (!result.second) + throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR); + } + + selected_parts.push_back(part); + } + + if (!temp_part_uuids.empty()) + { + auto duplicates = non_const_context.getPartUUIDs()->add(std::vector{temp_part_uuids.begin(), temp_part_uuids.end()}); + if (!duplicates.empty()) + { + /// on a local replica with prefer_localhost_replica=1 if any duplicates appeared during the first pass, + /// adding them to the exclusion, so they will be skipped on second pass + non_const_context.getIgnoredPartUUIDs()->add(duplicates); + return false; + 
} + } + + return true; + }; + + /// Process parts that have to be read for a query. + auto needs_retry = !select_parts(parts); + + /// If any duplicated part UUIDs met during the first step, try to ignore them in second pass + if (needs_retry) + { + LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them"); + + /// Second attempt didn't help, throw an exception + if (!select_parts(parts)) + throw Exception("Found duplicate UUIDs while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); + } +} } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index c3b3020ebf5..04a3be3d3f0 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -113,6 +114,24 @@ private: const Settings & settings, const MergeTreeReaderSettings & reader_settings, Poco::Logger * log); + + /// Select the parts in which there can be data that satisfy `minmax_idx_condition` and that match the condition on `_part`, + /// as well as `max_block_number_to_read`. + void selectPartsToRead( + MergeTreeData::DataPartsVector & parts, + const std::unordered_set & part_values, + const std::optional & minmax_idx_condition, + std::optional & partition_pruner, + const PartitionIdToMaxBlock * max_block_numbers_to_read) const; + + /// Same as previous but also skip parts uuids if any to the query context, or skip parts which uuids marked as excluded. + void selectPartsToReadWithUUIDFilter( + MergeTreeData::DataPartsVector & parts, + const std::unordered_set & part_values, + const std::optional & minmax_idx_condition, + std::optional & partition_pruner, + const PartitionIdToMaxBlock * max_block_numbers_to_read, + const Context & query_context) const; }; } From 449e8e3fd97ecc3c02fa64a6e763ebe8e54a019b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 13:15:02 +0300 Subject: [PATCH 078/122] More checks for setting --- src/Server/TCPHandler.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 0d040652342..fa213dcdc55 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -535,7 +535,9 @@ void TCPHandler::processOrdinaryQuery() /// Pull query execution result, if exists, and send it to network. if (state.io.in) { - sendPartUUIDs(); + + if (query_context->getSettingsRef().allow_experimental_query_deduplication) + sendPartUUIDs(); /// This allows the client to prepare output format if (Block header = state.io.in->getHeader()) @@ -601,7 +603,8 @@ void TCPHandler::processOrdinaryQueryWithProcessors() { auto & pipeline = state.io.pipeline; - sendPartUUIDs(); + if (query_context->getSettingsRef().allow_experimental_query_deduplication) + sendPartUUIDs(); /// Send header-block, to allow client to prepare output format for data to send. 
{ From 16d399aa3539be6a5f4d6b4ba3d7bb6acd542096 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 5 Feb 2021 13:31:18 +0300 Subject: [PATCH 079/122] Another build fix --- src/IO/MemoryReadWriteBuffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/MemoryReadWriteBuffer.cpp b/src/IO/MemoryReadWriteBuffer.cpp index 0b0d9704de6..69bcd52a8d2 100644 --- a/src/IO/MemoryReadWriteBuffer.cpp +++ b/src/IO/MemoryReadWriteBuffer.cpp @@ -61,7 +61,7 @@ private: position() = nullptr; } - return buffer().size() != 0; + return !buffer().empty(); } using Container = std::forward_list; From aafadc06df5136c15f260ed6eb271f250e973571 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 13:31:46 +0300 Subject: [PATCH 080/122] Better types --- src/Interpreters/Context.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 4dbdf390473..ea9ea172d3f 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -266,8 +266,8 @@ private: using SampleBlockCache = std::unordered_map; mutable SampleBlockCache sample_block_cache; - std::shared_ptr part_uuids; /// set of parts' uuids, is used for query parts deduplication - std::shared_ptr ignored_part_uuids; /// set of parts' uuids are meant to be excluded from query processing + PartUUIDsPtr part_uuids; /// set of parts' uuids, is used for query parts deduplication + PartUUIDsPtr ignored_part_uuids; /// set of parts' uuids are meant to be excluded from query processing NameToNameMap query_parameters; /// Dictionary with query parameters for prepared statements. /// (key=name, value) From f7dbdc623cc4122f7b03f621e2c3f4c4d745b74f Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov <36882414+akuzm@users.noreply.github.com> Date: Fri, 5 Feb 2021 13:45:38 +0300 Subject: [PATCH 081/122] Update style.md --- docs/ru/development/style.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/ru/development/style.md b/docs/ru/development/style.md index 4d71dca46a7..1b211259bbb 100644 --- a/docs/ru/development/style.md +++ b/docs/ru/development/style.md @@ -714,6 +714,7 @@ auto s = std::string{"Hello"}; ### Пользовательская ошибка {#error-messages-user-error} Такая ошибка вызвана действиями пользователя (неверный синтаксис запроса) или конфигурацией внешних систем (кончилось место на диске). Предполагается, что пользователь может устранить её самостоятельно. Для этого в сообщении об ошибке должна содержаться следующая информация: + * что произошло. Это должно объясняться в пользовательских терминах (`Function pow() is not supported for data type UInt128`), а не загадочными конструкциями из кода (`runtime overload resolution failed in DB::BinaryOperationBuilder::Impl, UInt128, Int8>::kaboongleFastPath()`). * почему/где/когда -- любой контекст, который помогает отладить проблему. Представьте, как бы её отлаживали вы (программировать и пользоваться отладчиком нельзя). * что можно предпринять для устранения ошибки. Здесь можно перечислить типичные причины проблемы, настройки, влияющие на это поведение, и так далее. 
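The `allow_experimental_query_deduplication` guards added to `TCPHandler` above mean the part-UUID exchange now runs only for queries that opt in; a minimal way to exercise that path from SQL is sketched below (the table name is a placeholder, nothing in these patches creates it):

``` sql
SET allow_experimental_query_deduplication = 1;

SELECT count() FROM some_replicated_table;
```

With the setting at its default the server skips `sendPartUUIDs()` altogether, which is exactly what the new checks are for.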
From 7b96ef61e8e3e9d9c74f850375003d025ab9f739 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 14:48:09 +0300 Subject: [PATCH 082/122] Add logging if Poco cannot allocate thread in tcp server --- contrib/poco | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/poco b/contrib/poco index e11f3c97157..fbaaba4a02e 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit e11f3c971570cf6a31006cd21cadf41a259c360a +Subproject commit fbaaba4a02e29987b8c584747a496c79528f125f From 3fc1641d9167a1b285826e21d95305568dc57a34 Mon Sep 17 00:00:00 2001 From: Stig Bakken Date: Thu, 4 Feb 2021 04:52:06 +0800 Subject: [PATCH 083/122] Show details of MaterializeMySQL tables in `system.tables` --- src/Storages/StorageMaterializeMySQL.cpp | 3 +-- .../materialize_with_ddl.py | 8 ++++++++ tests/integration/test_materialize_mysql_database/test.py | 4 ++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageMaterializeMySQL.cpp b/src/Storages/StorageMaterializeMySQL.cpp index 721221e3fdc..e59f1e22958 100644 --- a/src/Storages/StorageMaterializeMySQL.cpp +++ b/src/Storages/StorageMaterializeMySQL.cpp @@ -30,9 +30,8 @@ namespace DB StorageMaterializeMySQL::StorageMaterializeMySQL(const StoragePtr & nested_storage_, const IDatabase * database_) : StorageProxy(nested_storage_->getStorageID()), nested_storage(nested_storage_), database(database_) { - auto nested_memory_metadata = nested_storage->getInMemoryMetadata(); StorageInMemoryMetadata in_memory_metadata; - in_memory_metadata.setColumns(nested_memory_metadata.getColumns()); + in_memory_metadata = nested_storage->getInMemoryMetadata(); setInMemoryMetadata(in_memory_metadata); } diff --git a/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py index b7f432d963b..c9be2387fc7 100644 --- a/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py @@ -772,3 +772,11 @@ def multi_table_update_test(clickhouse_node, mysql_node, service_name): check_query(clickhouse_node, "SELECT * FROM multi_table_update.a", "1\tbaz\n") check_query(clickhouse_node, "SELECT * FROM multi_table_update.b", "1\tquux\n") + +def system_tables_test(clickhouse_node, mysql_node, service_name): + mysql_node.query("DROP DATABASE IF EXISTS system_tables_test") + clickhouse_node.query("DROP DATABASE IF EXISTS system_tables_test") + mysql_node.query("CREATE DATABASE system_tables_test") + mysql_node.query("CREATE TABLE system_tables_test.test (id int NOT NULL PRIMARY KEY) ENGINE=InnoDB") + clickhouse_node.query("CREATE DATABASE system_tables_test ENGINE = MaterializeMySQL('{}:3306', 'system_tables_test', 'root', 'clickhouse')".format(service_name)) + check_query(clickhouse_node, "SELECT partition_key, sorting_key, primary_key FROM system.tables WHERE database = 'system_tables_test' AND name = 'test'", "intDiv(id, 4294967)\tid\tid\n") diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index 32316901dce..e55772d9e1d 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ b/tests/integration/test_materialize_mysql_database/test.py @@ -242,3 +242,7 @@ def test_system_parts_table(started_cluster, started_mysql_8_0, clickhouse_node) def test_multi_table_update(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): 
materialize_with_ddl.multi_table_update_test(clickhouse_node, started_mysql_5_7, "mysql1") materialize_with_ddl.multi_table_update_test(clickhouse_node, started_mysql_8_0, "mysql8_0") + +@pytest.mark.parametrize(('clickhouse_node'), [node_db_ordinary, node_db_ordinary]) +def test_system_tables_table(started_cluster, started_mysql_8_0, clickhouse_node): + materialize_with_ddl.system_tables_test(clickhouse_node, started_mysql_8_0, "mysql8_0") From 92a4c4e318ac52f0e01da2bd1f2637f93b677f9a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 18:28:20 +0300 Subject: [PATCH 084/122] Add librdkafka to integration tests runner --- docker/test/integration/runner/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index f353931f0a0..fb853ecf751 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -27,6 +27,7 @@ RUN apt-get update \ luajit \ libssl-dev \ libcurl4-openssl-dev \ + librdkafka-dev \ gdb \ software-properties-common \ libkrb5-dev \ From 6247d59c32922985380be690fe134fa5bddc0748 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 18:30:45 +0300 Subject: [PATCH 085/122] Use fixed version --- docker/test/integration/runner/Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index fb853ecf751..502dc3736b2 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -27,7 +27,6 @@ RUN apt-get update \ luajit \ libssl-dev \ libcurl4-openssl-dev \ - librdkafka-dev \ gdb \ software-properties-common \ libkrb5-dev \ @@ -62,7 +61,7 @@ RUN python3 -m pip install \ aerospike \ avro \ cassandra-driver \ - confluent-kafka \ + confluent-kafka==1.5.0 \ dict2xml \ dicttoxml \ docker \ From 95e21ba093374e1b637a7189282ec38e6a7471b7 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Fri, 5 Feb 2021 07:53:22 -0800 Subject: [PATCH 086/122] Docs - remove duplicate date_diff and minor fixes --- .../functions/date-time-functions.md | 131 +++++------------- 1 file changed, 34 insertions(+), 97 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index fa4ea7a739e..35c033b5a2c 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -449,17 +449,9 @@ Aliases: `dateAdd`, `DATE_ADD`. **Parameters** -- `unit` - The unit of time - [String](../syntax.md#syntax-string-literal). - Possible values: +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). - - `second` - - `minute` - - `hour` - - `day` - - `week` - - `month` - - `quarter` - - `year` + Supported values: second, minute, hour, day, week, month, quarter, year. - `value` - Amount of the specified unit of time. - `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). @@ -476,43 +468,48 @@ select date_add(YEAR, 3, toDate('2018-01-01')); └───────────────────────────────────────────────┘ ``` -## date\_diff {#date_diff} +## date\_diff {#dated_diff} -Returns the difference between two dates in terms of the specified unit. +Returns the difference between two Date or DateTime values. 
-**Syntax** +**Syntax** ``` sql -date_sub(unit, date1, date2) +date_diff('unit', startdate, enddate, [timezone]) ``` -Aliases: `date_diff`, `DATE_DIFF`. - **Parameters** -- `unit` - The unit of time - [String](../syntax.md#syntax-string-literal). - Possible values: +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). - - `second` - - `minute` - - `hour` - - `day` - - `week` - - `month` - - `quarter` - - `year` -- `date1`,`date2` - Dates or Dates with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + Supported values: second, minute, hour, day, week, month, quarter, year. + +- `startdate` — The first time value to compare. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + +- `enddate` — The second time value to compare. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + +- `timezone` — Optional parameter. If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. + +**Returned value** + +Difference between `startdate` and `enddate` expressed in `unit`. + +Type: `int`. **Example** -```sql -select date_diff(MONTH, toDate('2018-12-18'), toDate('2018-01-01')); +Query: + +``` sql +SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); ``` -```text -┌─dateDiff('month', toDate('2018-12-18'), toDate('2018-01-01'))─┐ -│ -11 │ -└───────────────────────────────────────────────────────────────┘ +Result: + +``` text +┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ +│ 25 │ +└────────────────────────────────────────────────────────────────────────────────────────┘ ``` ## timestamp\_add {#timestamp_add} @@ -531,17 +528,9 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. - `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). - `value` - Amount of the specified unit of time - [String](../syntax.md#syntax-string-literal) -- `unit` - The unit of time interval - [String](../syntax.md#syntax-string-literal). - Possible values: +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). - - `second` - - `minute` - - `hour` - - `day` - - `week` - - `month` - - `quarter` - - `year` + Supported values: second, minute, hour, day, week, month, quarter, year. **Example** @@ -569,17 +558,9 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. **Parameters** -- `unit` - The unit of time - [String](../syntax.md#syntax-string-literal). - Possible values: +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). - - `second` - - `minute` - - `hour` - - `day` - - `week` - - `month` - - `quarter` - - `year` + Supported values: second, minute, hour, day, week, month, quarter, year. - `value` - Amount of the specified unit of time. [String](../syntax.md#syntax-string-literal). - `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
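For the `timestamp_sub` parameters rewritten just above, usage stays as in the section's own example; a short sketch (the result simply follows from subtracting the given number of units from the date, so the exact output is assumed, not taken from this patch):

``` sql
select timestamp_sub(MONTH, 5, toDateTime('2018-12-18 01:02:03'));
-- expected: 2018-07-18 01:02:03
```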
@@ -710,50 +691,6 @@ SELECT └──────────────────────────┴───────────────────────────────┘ ``` -## dateDiff {#datediff} - -Returns the difference between two Date or DateTime values. - -**Syntax** - -``` sql -dateDiff('unit', startdate, enddate, [timezone]) -``` - -**Parameters** - -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). - - Supported values: second, minute, hour, day, week, month, quarter, year. - -- `startdate` — The first time value to compare. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). - -- `enddate` — The second time value to compare. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). - -- `timezone` — Optional parameter. If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. - -**Returned value** - -Difference between `startdate` and `enddate` expressed in `unit`. - -Type: `int`. - -**Example** - -Query: - -``` sql -SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); -``` - -Result: - -``` text -┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ -│ 25 │ -└────────────────────────────────────────────────────────────────────────────────────────┘ -``` - ## timeSlots(StartTime, Duration,\[, Size\]) {#timeslotsstarttime-duration-size} For a time interval starting at ‘StartTime’ and continuing for ‘Duration’ seconds, it returns an array of moments in time, consisting of points from this interval rounded down to the ‘Size’ in seconds. ‘Size’ is an optional parameter: a constant UInt32, set to 1800 by default. From facdc749cb1322499f21cfd4a2147f2c8c2b53c7 Mon Sep 17 00:00:00 2001 From: Bharat Nallan Date: Fri, 5 Feb 2021 07:54:14 -0800 Subject: [PATCH 087/122] Docs - commit suggestion Co-authored-by: vdimir --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 35c033b5a2c..3e85ff42834 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -453,7 +453,7 @@ Aliases: `dateAdd`, `DATE_ADD`. Supported values: second, minute, hour, day, week, month, quarter, year. - `value` - Amount of the specified unit of time. -- `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
**Example** From 6eb145697e44dd78a2e80cdfdb4fae476b69855f Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Fri, 5 Feb 2021 18:56:24 +0300 Subject: [PATCH 088/122] Add benchmark results for Broadwell 8vCPU S3/SSD in Yandex.Cloud --- .../yandex_cloud_broadwell_8_vcpu.json | 55 +++++++++++++++++++ .../yandex_cloud_broadwell_8_vcpu_s3.json | 55 +++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu.json create mode 100644 website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu_s3.json diff --git a/website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu.json b/website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu.json new file mode 100644 index 00000000000..1217adbbff5 --- /dev/null +++ b/website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu.json @@ -0,0 +1,55 @@ +[ + { + "system": "Yandex Cloud 8vCPU", + "system_full": "Yandex Cloud Broadwell, 8 vCPU (4 threads), 64 GB RAM, 500 GB SSD", + "cpu_vendor": "Intel", + "time": "2021-02-05 00:00:00", + "kind": "cloud", + "result": + [ + [0.004, 0.003, 0.003], + [0.047, 0.030, 0.021], + [0.129, 0.066, 0.067], + [0.873, 0.098, 0.095], + [0.869, 0.247, 0.257], + [1.429, 0.818, 0.768], + [0.055, 0.042, 0.043], + [0.034, 0.025, 0.024], + [1.372, 1.003, 1.051], + [1.605, 1.281, 1.209], + [0.942, 0.503, 0.483], + [0.980, 0.537, 0.558], + [2.076, 1.664, 1.635], + [3.136, 2.235, 2.171], + [2.351, 1.973, 1.974], + [2.369, 2.170, 2.133], + [6.281, 5.576, 5.498], + [3.739, 3.481, 3.354], + [10.947, 10.225, 10.271], + [0.875, 0.111, 0.108], + [10.832, 1.844, 1.877], + [12.344, 2.330, 2.227], + [22.999, 5.000, 4.903], + [20.086, 2.390, 2.278], + [3.036, 0.722, 0.673], + [1.420, 0.602, 0.578], + [3.040, 0.728, 0.714], + [10.842, 1.874, 1.783], + [9.207, 2.809, 2.705], + [2.751, 2.703, 2.714], + [2.810, 1.675, 1.568], + [6.507, 2.449, 2.505], + [15.968, 15.014, 15.318], + [13.479, 7.951, 7.702], + [13.227, 7.791, 7.699], + [2.811, 2.723, 2.549], + [0.358, 0.249, 0.273], + [0.157, 0.099, 0.101], + [0.189, 0.088, 0.080], + [0.758, 0.544, 0.525], + [0.115, 0.033, 0.027], + [0.063, 0.048, 0.023], + [0.014, 0.011, 0.008] + ] + } +] diff --git a/website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu_s3.json b/website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu_s3.json new file mode 100644 index 00000000000..ace2442c86e --- /dev/null +++ b/website/benchmark/hardware/results/yandex_cloud_broadwell_8_vcpu_s3.json @@ -0,0 +1,55 @@ +[ + { + "system": "Yandex Cloud 8vCPU Object Storage", + "system_full": "Yandex Cloud Broadwell, 8 vCPU (4 threads), 64 GB RAM, Object Storage", + "cpu_vendor": "Intel", + "time": "2021-02-05 00:00:00", + "kind": "cloud", + "result": + [ + [0.007, 0.003, 0.003], + [0.214, 0.111, 0.096], + [1.239, 1.359, 0.718], + [3.056, 3.366, 1.869], + [1.946, 1.552, 2.450], + [4.804, 2.307, 2.398], + [0.198, 0.108, 0.114], + [0.141, 0.104, 0.100], + [2.755, 2.749, 3.608], + [3.140, 3.905, 3.830], + [2.353, 4.996, 1.637], + [3.796, 1.536, 1.724], + [3.565, 3.016, 3.381], + [4.962, 4.263, 4.352], + [4.210, 3.974, 4.318], + [3.884, 3.434, 3.124], + [10.451, 9.147, 7.526], + [6.288, 5.882, 7.714], + [15.239, 33.243, 17.968], + [1.645, 1.870, 3.230], + [10.980, 8.984, 7.589], + [14.345, 11.503, 12.449], + [17.687, 17.764, 18.984], + [76.606, 65.179, 94.215], + [5.833, 3.347, 3.127], + [3.815, 2.574, 2.402], + [4.916, 6.169, 5.731], + [7.961, 9.930, 8.555], + [5.995, 7.382, 
6.054], + [3.113, 4.176, 3.172], + [5.077, 5.221, 5.709], + [8.990, 9.598, 6.272], + [17.832, 17.668, 17.276], + [11.846, 14.692, 13.225], + [12.544, 12.502, 12.725], + [3.604, 4.811, 3.267], + [0.738, 0.751, 0.862], + [0.718, 0.611, 0.561], + [2.125, 0.688, 0.522], + [1.469, 1.546, 1.373], + [1.382, 1.069, 0.976], + [1.353, 1.212, 1.119], + [0.045, 0.031, 0.041] + ] + } +] From e8d5fbc0a2dd01e1bb762e0451c5d2a138b2e37c Mon Sep 17 00:00:00 2001 From: bharatnc Date: Fri, 5 Feb 2021 08:07:02 -0800 Subject: [PATCH 089/122] Docs - more minor fixes --- .../functions/date-time-functions.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 3e85ff42834..5d96fd78c11 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -456,6 +456,10 @@ Aliases: `dateAdd`, `DATE_ADD`. - `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +**Returned value** + +Returns Date or DateTime with `value` expressed in `unit` added to `date`. + **Example** ```sql @@ -478,6 +482,8 @@ Returns the difference between two Date or DateTime values. date_diff('unit', startdate, enddate, [timezone]) ``` +Aliases: `dateDiff`, `DATE_DIFF`. + **Parameters** - `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -531,6 +537,10 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. - `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). Supported values: second, minute, hour, day, week, month, quarter, year. + +**Returned value** + +Returns Date or DateTime with the specified `value` expressed in `unit` added to `date`. **Example** @@ -562,7 +572,11 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. Supported values: second, minute, hour, day, week, month, quarter, year. - `value` - Amount of the specified unit of time. [String](../syntax.md#syntax-string-literal). -- `date1`, `date2` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `date`- [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + +**Returned value** + +Difference between `date` and the specified `value` expressed in `unit`. **Example** From 4906fd9c8c1cbaaed741d1dc015d50198ac093c5 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Fri, 5 Feb 2021 08:55:30 -0800 Subject: [PATCH 090/122] Docs - fixes to doc link --- .../functions/date-time-functions.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 5d96fd78c11..173fd8e4af0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -380,7 +380,7 @@ Alias: `dateTrunc`. **Parameters** -- `unit` — Part of date. [String](../syntax.md#syntax-string-literal). +- `unit` — Part of date. [String](../../sql-reference/data-types/string.md). Possible values: - `second` @@ -449,10 +449,10 @@ Aliases: `dateAdd`, `DATE_ADD`. **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). 
+- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. -- `value` - Amount of the specified unit of time. +- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) - `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). @@ -472,7 +472,7 @@ select date_add(YEAR, 3, toDate('2018-01-01')); └───────────────────────────────────────────────┘ ``` -## date\_diff {#dated_diff} +## date\_diff {#date_diff} Returns the difference between two Date or DateTime values. @@ -486,7 +486,7 @@ Aliases: `dateDiff`, `DATE_DIFF`. **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. @@ -533,8 +533,8 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. **Parameters** - `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). -- `value` - Amount of the specified unit of time - [String](../syntax.md#syntax-string-literal) -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. @@ -568,10 +568,10 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/syntax.md#syntax-string-literal). +- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. -- `value` - Amount of the specified unit of time. [String](../syntax.md#syntax-string-literal). +- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md). - `date`- [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
**Returned value** From 60f2e89cfba58dbaf263a7899cd4a63fedb4b729 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 5 Feb 2021 22:21:30 +0300 Subject: [PATCH 091/122] Fix the case when DataType parser may have exponential complexity --- src/Parsers/ParserDataType.cpp | 6 ++++-- .../01691_parser_data_type_exponential.reference | 0 .../0_stateless/01691_parser_data_type_exponential.sh | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/01691_parser_data_type_exponential.reference create mode 100755 tests/queries/0_stateless/01691_parser_data_type_exponential.sh diff --git a/src/Parsers/ParserDataType.cpp b/src/Parsers/ParserDataType.cpp index 3d3f393a300..dd495fe6d53 100644 --- a/src/Parsers/ParserDataType.cpp +++ b/src/Parsers/ParserDataType.cpp @@ -32,8 +32,10 @@ private: const char * operators[] = {"=", "equals", nullptr}; ParserLeftAssociativeBinaryOperatorList enum_parser(operators, std::make_unique()); - return nested_parser.parse(pos, node, expected) - || enum_parser.parse(pos, node, expected) + if (pos->type == TokenType::BareWord && std::string_view(pos->begin, pos->size()) == "Nested") + return nested_parser.parse(pos, node, expected); + + return enum_parser.parse(pos, node, expected) || literal_parser.parse(pos, node, expected) || data_type_parser.parse(pos, node, expected); } diff --git a/tests/queries/0_stateless/01691_parser_data_type_exponential.reference b/tests/queries/0_stateless/01691_parser_data_type_exponential.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01691_parser_data_type_exponential.sh b/tests/queries/0_stateless/01691_parser_data_type_exponential.sh new file mode 100755 index 00000000000..2b1d34982a2 --- /dev/null +++ b/tests/queries/0_stateless/01691_parser_data_type_exponential.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Check that DataType parser does not have exponential complexity in the case found by fuzzer. 
+for _ in {1..10}; do ${CLICKHOUSE_CLIENT} -n --testmode --query "SELECT CAST(1 AS A2222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222220000000000000000000000000000000000000000000000000000000000000000000000000000002260637443813394204 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpio22222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggre222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 22222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 2222222222222eFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222222222222222222222222200000000000000000000178859639454016722222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpio22222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 
222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222222222222222222222222200000000000000000000178859639454016722222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222, 222222222222222ggregateFuncpion(groupBitmap222222222222222222222222222222222222222222222222222222222222222222222222000000000000000000001788596394540167623 222222222222222222ggregateFu22222222222222222222222222 222222222, UInt33)); -- { clientError 62 }"; done From f81a407cddc69960cbad86bd62c5b6dfac767d4a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 5 Feb 2021 22:39:26 +0300 Subject: [PATCH 092/122] Fix if with tuple then/else arguments --- src/Functions/if.cpp | 7 ++-- .../01701_if_tuple_segfault.reference | 3 ++ .../0_stateless/01701_if_tuple_segfault.sql | 33 +++++++++++++++++++ 3 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/01701_if_tuple_segfault.reference create mode 100644 tests/queries/0_stateless/01701_if_tuple_segfault.sql diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 3be4848f1ff..614bfcf700e 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -532,7 +532,7 @@ private: return nullptr; } - ColumnPtr executeTuple(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const + ColumnPtr executeTuple(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const { /// Calculate function for each corresponding elements of tuples. 
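The fix is to pass the actual result tuple element types down into the per-element calls (continued in the hunks below) instead of deriving a placeholder type for each element; the crash this prevents is reproduced by the `01701_if_tuple_segfault` test added at the end of this patch, for example (against the `agg_table` defined in that test):

``` sql
SELECT if(xxx = 'x', ([2], 3), ([3], 4)) FROM agg_table;
-- expected: ([3],4)
```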
@@ -558,6 +558,7 @@ private: const DataTypeTuple & type1 = static_cast(*arg1.type); const DataTypeTuple & type2 = static_cast(*arg2.type); + const DataTypeTuple & tuple_result = static_cast(*result_type); ColumnsWithTypeAndName temporary_columns(3); temporary_columns[0] = arguments[0]; @@ -570,7 +571,7 @@ private: temporary_columns[1] = {col1_contents[i], type1.getElements()[i], {}}; temporary_columns[2] = {col2_contents[i], type2.getElements()[i], {}}; - tuple_columns[i] = executeImpl(temporary_columns, std::make_shared(), input_rows_count); + tuple_columns[i] = executeImpl(temporary_columns, tuple_result.getElements()[i], input_rows_count); } return ColumnTuple::create(tuple_columns); @@ -988,7 +989,7 @@ public: || (res = executeTyped(cond_col, arguments, result_type, input_rows_count)) || (res = executeString(cond_col, arguments, result_type)) || (res = executeGenericArray(cond_col, arguments, result_type)) - || (res = executeTuple(arguments, input_rows_count)))) + || (res = executeTuple(arguments, result_type, input_rows_count)))) { return executeGeneric(cond_col, arguments, input_rows_count); } diff --git a/tests/queries/0_stateless/01701_if_tuple_segfault.reference b/tests/queries/0_stateless/01701_if_tuple_segfault.reference new file mode 100644 index 00000000000..001e50da954 --- /dev/null +++ b/tests/queries/0_stateless/01701_if_tuple_segfault.reference @@ -0,0 +1,3 @@ +2020-10-01 19:20:30 hello ([0],45) 45 ([0,1,2,3,4,5,6,7,8,9,10,11,12],[45,55,65,75,85,95,105,115,125,135,145,155,165]) +([3],4) +2020-10-01 19:20:30 hello ([0],45) 5 ([0,1,2,3,4,5,6,7,8,9,10,11,12],[22,27,32,37,42,47,52,57,62,67,72,77,82]) diff --git a/tests/queries/0_stateless/01701_if_tuple_segfault.sql b/tests/queries/0_stateless/01701_if_tuple_segfault.sql new file mode 100644 index 00000000000..93b28c578a9 --- /dev/null +++ b/tests/queries/0_stateless/01701_if_tuple_segfault.sql @@ -0,0 +1,33 @@ +DROP TABLE IF EXISTS agg_table; + +CREATE TABLE IF NOT EXISTS agg_table +( + time DateTime CODEC(DoubleDelta, LZ4), + xxx String, + two_values Tuple(Array(UInt16), UInt32), + agg_simple SimpleAggregateFunction(sum, UInt64), + agg SimpleAggregateFunction(sumMap, Tuple(Array(Int16), Array(UInt64))) +) +ENGINE = AggregatingMergeTree() +ORDER BY (xxx, time); + +INSERT INTO agg_table SELECT toDateTime('2020-10-01 19:20:30'), 'hello', ([any(number)], sum(number)), sum(number), + sumMap((arrayMap(i -> toString(i), range(13)), arrayMap(i -> (number + i), range(13)))) FROM numbers(10); + +SELECT * FROM agg_table; + +SELECT if(xxx = 'x', ([2], 3), ([3], 4)) FROM agg_table; + +SELECT if(xxx = 'x', ([2], 3), ([3], 4, 'q', 'w', 7)) FROM agg_table; --{ serverError 386 } + +ALTER TABLE agg_table UPDATE two_values = (two_values.1, two_values.2) WHERE time BETWEEN toDateTime('2020-08-01 00:00:00') AND toDateTime('2020-12-01 00:00:00') SETTINGS mutations_sync = 2; + +ALTER TABLE agg_table UPDATE agg_simple = 5 WHERE time BETWEEN toDateTime('2020-08-01 00:00:00') AND toDateTime('2020-12-01 00:00:00') SETTINGS mutations_sync = 2; + +ALTER TABLE agg_table UPDATE agg = (agg.1, agg.2) WHERE time BETWEEN toDateTime('2020-08-01 00:00:00') AND toDateTime('2020-12-01 00:00:00') SETTINGS mutations_sync = 2; + +ALTER TABLE agg_table UPDATE agg = (agg.1, arrayMap(x -> toUInt64(x / 2), agg.2)) WHERE time BETWEEN toDateTime('2020-08-01 00:00:00') AND toDateTime('2020-12-01 00:00:00') SETTINGS mutations_sync = 2; + +SELECT * FROM agg_table; + +DROP TABLE IF EXISTS agg_table; From 8dc3a207207a600e825088fd1cccbb67189a5248 Mon Sep 17 00:00:00 2001 From: 
bharatnc Date: Fri, 5 Feb 2021 13:33:47 -0800 Subject: [PATCH 093/122] Docs - improve unit description --- .../sql-reference/functions/date-time-functions.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 173fd8e4af0..9080d191ce3 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -380,7 +380,7 @@ Alias: `dateTrunc`. **Parameters** -- `unit` — Part of date. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). Possible values: - `second` @@ -449,7 +449,7 @@ Aliases: `dateAdd`, `DATE_ADD`. **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. - `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) @@ -486,7 +486,7 @@ Aliases: `dateDiff`, `DATE_DIFF`. **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. @@ -534,7 +534,7 @@ Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. - `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). - `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. @@ -568,11 +568,11 @@ Aliases: `timeStampSub`, `TIMESTAMP_SUB`. **Parameters** -- `unit` — Time unit, in which the returned value is expressed. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. -- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md). -- `date`- [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md). +- `date`- [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
**Returned value** From af26ad6df9dc2fd22ea989d94429f8de4f2019bb Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 6 Feb 2021 12:49:04 +0300 Subject: [PATCH 094/122] Minor changes in missed out date time functions --- docs/en/sql-reference/functions/date-time-functions.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 9080d191ce3..2cec116f986 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -380,7 +380,7 @@ Alias: `dateTrunc`. **Parameters** -- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to truncate the result. [String](../../sql-reference/data-types/string.md). Possible values: - `second` @@ -486,19 +486,19 @@ Aliases: `dateDiff`, `DATE_DIFF`. **Parameters** -- `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval for result [String](../../sql-reference/data-types/string.md). Supported values: second, minute, hour, day, week, month, quarter, year. -- `startdate` — The first time value to compare. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `startdate` — The first time value to subtract (the subtrahend). [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). -- `enddate` — The second time value to compare. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `enddate` — The second time value to subtract from (the minuend). [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). - `timezone` — Optional parameter. If specified, it is applied to both `startdate` and `enddate`. If not specified, timezones of `startdate` and `enddate` are used. If they are not the same, the result is unspecified. **Returned value** -Difference between `startdate` and `enddate` expressed in `unit`. +Difference between `enddate` and `startdate` expressed in `unit`. Type: `int`. From f4c2048bf21b467fb542ee9a897df2921b500080 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Feb 2021 22:06:22 +0300 Subject: [PATCH 095/122] Fix toDateTime64(toDate()/toDateTime()) for DateTime64 Maybe this is even does not worth it, but at least there was code that assume that this should work - ToDateTime64Transform in FunctionsConversion.h. 
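In SQL terms, the conversion that starts working with this change is the one covered by the test added below; the expected outputs are taken from its `.reference` file, the fractional part comes from the requested scale of 2, and the wall-clock values reflect the test server's UTC+3 time zone:

``` sql
select toDateTime64(toDateTime(1), 2); -- 1970-01-01 03:00:01.00
select toDateTime64(toDate(1), 2);     -- 1970-01-02 00:00:00.00
```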
--- src/Functions/DateTimeTransforms.h | 6 +++++- .../0_stateless/01692_DateTime64_from_DateTime.reference | 5 +++++ .../queries/0_stateless/01692_DateTime64_from_DateTime.sql | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference create mode 100644 tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index b55f78e71bd..333b397312d 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -704,7 +704,11 @@ struct DateTimeTransformImpl { using Op = Transformer; - const DateLUTImpl & time_zone = extractTimeZoneFromFunctionArguments(arguments, 1, 0); + size_t time_zone_argument_position = 1; + if constexpr (std::is_same_v) + time_zone_argument_position = 2; + + const DateLUTImpl & time_zone = extractTimeZoneFromFunctionArguments(arguments, time_zone_argument_position, 0); const ColumnPtr source_col = arguments[0].column; if (const auto * sources = checkAndGetColumn(source_col.get())) diff --git a/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference new file mode 100644 index 00000000000..183d6f1222c --- /dev/null +++ b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference @@ -0,0 +1,5 @@ +-- { echo } +select toDateTime64(toDateTime(1), 2); +1970-01-01 03:00:01.00 +select toDateTime64(toDate(1), 2); +1970-01-02 00:00:00.00 diff --git a/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql new file mode 100644 index 00000000000..543c6b373da --- /dev/null +++ b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql @@ -0,0 +1,3 @@ +-- { echo } +select toDateTime64(toDateTime(1), 2); +select toDateTime64(toDate(1), 2); From 0627ba0e36435f73bcb30bb42db5e92e1a89b0af Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Feb 2021 22:06:23 +0300 Subject: [PATCH 096/122] Fix timezone argument for DateTime64 It should be marked with always const, otherwise it will bail: Code: 44, e.displayText() = DB::Exception: Illegal column String of time zone argument of function, must be constant string: While processing toDateTime(-1, 1, 'GMT'), Stack trace (when copying this message, always include the lines below): --- src/Functions/FunctionsConversion.h | 7 ++++++- .../0_stateless/01692_DateTime64_from_DateTime.reference | 4 ++++ .../queries/0_stateless/01692_DateTime64_from_DateTime.sql | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 96e49686526..df58e184d54 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1294,7 +1294,12 @@ public: bool useDefaultImplementationForNulls() const override { return checked_return_type; } bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + if constexpr (std::is_same_v) + return {2}; + return {1}; + } bool canBeExecutedOnDefaultArguments() const override { return false; } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override diff --git 
a/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference index 183d6f1222c..a0562e40027 100644 --- a/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference +++ b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.reference @@ -3,3 +3,7 @@ select toDateTime64(toDateTime(1), 2); 1970-01-01 03:00:01.00 select toDateTime64(toDate(1), 2); 1970-01-02 00:00:00.00 +select toDateTime64(toDateTime(1), 2, 'GMT'); +1970-01-01 00:00:01.00 +select toDateTime64(toDate(1), 2, 'GMT'); +1970-01-02 00:00:00.00 diff --git a/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql index 543c6b373da..60f76e9192c 100644 --- a/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql +++ b/tests/queries/0_stateless/01692_DateTime64_from_DateTime.sql @@ -1,3 +1,5 @@ -- { echo } select toDateTime64(toDateTime(1), 2); select toDateTime64(toDate(1), 2); +select toDateTime64(toDateTime(1), 2, 'GMT'); +select toDateTime64(toDate(1), 2, 'GMT'); From c4b5eed4ff00506b52c1280380bbf23e911359a3 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Feb 2021 22:06:23 +0300 Subject: [PATCH 097/122] Fix DateTime64 initialization (to match DateTime behaviour) There was no specializations for toDateTime64(), and because of this default decimal conversion was used, however this is not enough for DateTime/DateTime64 types, since the date may overflow and the proper check is required (like DateTime has), and this what UBsan found [1]: ../src/IO/WriteHelpers.h:812:33: runtime error: index 508 out of bounds for type 'const char [201]' Received signal -3 Received signal Unknown signal (-3) Backtrace: (gdb) bt 0 LocalDateTime::LocalDateTime (this=0x7fffffff8418, year_=1970, month_=1 '\001', day_=1 '\001', hour_=2 '\002', minute_=0 '\000', second_=254 '\376') at LocalDateTime.h:83 1 0x00000000138a5edb in DB::writeDateTimeText<(char)45, (char)58, (char)32, (char)46> (datetime64=..., scale=7, buf=..., date_lut=...) at WriteHelpers.h:852 2 0x0000000019c379b4 in DB::DataTypeDateTime64::serializeText (this=0x7ffff5c4b0d8, column=..., row_num=0, ostr=..., settings=...) at DataTypeDateTime64.cpp:66 3 0x0000000019d297e4 in DB::IDataType::serializeAsText (this=0x7ffff5c4b0d8, column=..., row_num=0, ostr=..., settings=...) 
at IDataType.cpp:387 [1]: https://clickhouse-test-reports.s3.yandex.net/19527/cea8ae162ffbf92e5ed29304ab010704c5d611c8/fuzzer_ubsan/report.html#fail1 Also fix CAST for DateTime64 --- src/Functions/FunctionsConversion.h | 55 ++++++++++++++++++- .../01691_DateTime64_clamp.reference | 9 +++ .../0_stateless/01691_DateTime64_clamp.sql | 5 ++ 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01691_DateTime64_clamp.reference create mode 100644 tests/queries/0_stateless/01691_DateTime64_clamp.sql diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index df58e184d54..df0cba4c844 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -477,6 +477,59 @@ template struct ConvertImpl struct ConvertImpl : DateTimeTransformImpl {}; +/** Conversion of numeric to DateTime64 + */ + +template +struct ToDateTime64TransformUnsigned +{ + static constexpr auto name = "toDateTime64"; + + const DateTime64::NativeType scale_multiplier = 1; + + ToDateTime64TransformUnsigned(UInt32 scale = 0) + : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) + {} + + inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(const FromType & from, const DateLUTImpl &) const + { + return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); + } +}; +template +struct ToDateTime64TransformSigned +{ + static constexpr auto name = "toDateTime64"; + + const DateTime64::NativeType scale_multiplier = 1; + + ToDateTime64TransformSigned(UInt32 scale = 0) + : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) + {} + + inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(const FromType & from, const DateLUTImpl &) const + { + if (from < 0) + return 0; + return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); + } +}; + +template struct ConvertImpl + : DateTimeTransformImpl> {}; +template struct ConvertImpl + : DateTimeTransformImpl> {}; +template struct ConvertImpl + : DateTimeTransformImpl> {}; +template struct ConvertImpl + : DateTimeTransformImpl> {}; +template struct ConvertImpl + : DateTimeTransformImpl> {}; +template struct ConvertImpl + : DateTimeTransformImpl> {}; +template struct ConvertImpl + : DateTimeTransformImpl> {}; + /** Conversion of DateTime64 to Date or DateTime: discards fractional part. 
*/ template @@ -2318,7 +2371,7 @@ private: using LeftDataType = typename Types::LeftType; using RightDataType = typename Types::RightType; - if constexpr (IsDataTypeDecimalOrNumber && IsDataTypeDecimalOrNumber) + if constexpr (IsDataTypeDecimalOrNumber && IsDataTypeDecimalOrNumber && !std::is_same_v) { if (wrapper_cast_type == CastType::accurate) { diff --git a/tests/queries/0_stateless/01691_DateTime64_clamp.reference b/tests/queries/0_stateless/01691_DateTime64_clamp.reference new file mode 100644 index 00000000000..1d222e2cb21 --- /dev/null +++ b/tests/queries/0_stateless/01691_DateTime64_clamp.reference @@ -0,0 +1,9 @@ +-- { echo } +SELECT toDateTime(-2, 2); +1970-01-01 03:00:00.00 +SELECT toDateTime64(-2, 2); +1970-01-01 03:00:00.00 +SELECT CAST(-1 AS DateTime64); +1970-01-01 03:00:00.000 +SELECT CAST('2020-01-01 00:00:00.3' AS DateTime64); +2020-01-01 00:00:00.300 diff --git a/tests/queries/0_stateless/01691_DateTime64_clamp.sql b/tests/queries/0_stateless/01691_DateTime64_clamp.sql new file mode 100644 index 00000000000..7ccce597adf --- /dev/null +++ b/tests/queries/0_stateless/01691_DateTime64_clamp.sql @@ -0,0 +1,5 @@ +-- { echo } +SELECT toDateTime(-2, 2); +SELECT toDateTime64(-2, 2); +SELECT CAST(-1 AS DateTime64); +SELECT CAST('2020-01-01 00:00:00.3' AS DateTime64); From 4092916db68de1264bb2afb3f1a27f681c43e28c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 6 Feb 2021 14:56:05 +0300 Subject: [PATCH 098/122] Useless changes --- src/Common/ThreadProfileEvents.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index e6336baecda..327178c92ff 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -68,7 +68,7 @@ TasksStatsCounters::TasksStatsCounters(const UInt64 tid, const MetricsProvider p case MetricsProvider::Netlink: stats_getter = [metrics_provider = std::make_shared(), tid]() { - ::taskstats result; + ::taskstats result{}; metrics_provider->getStat(result, tid); return result; }; @@ -76,7 +76,7 @@ TasksStatsCounters::TasksStatsCounters(const UInt64 tid, const MetricsProvider p case MetricsProvider::Procfs: stats_getter = [metrics_provider = std::make_shared(tid)]() { - ::taskstats result; + ::taskstats result{}; metrics_provider->getTaskStats(result); return result; }; From a790cd9bc5e96be05052201eae5fc65f91b20055 Mon Sep 17 00:00:00 2001 From: George Date: Sat, 6 Feb 2021 16:37:12 +0300 Subject: [PATCH 099/122] Fixes --- docs/en/sql-reference/functions/array-functions.md | 6 +++--- docs/ru/sql-reference/functions/array-functions.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index be6440bbe9c..d5b357795d7 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1311,7 +1311,7 @@ arrayMin([func,] arr) - The minimum of function values (or the array minimum). -Type: matches the array elements type. +Type: if `func` is specified, matches `func` return value type, else matches the array elements type. **Examples** @@ -1366,7 +1366,7 @@ arrayMax([func,] arr) - The maximum of function values (or the array maximum). -Type: matches the array elements type. +Type: if `func` is specified, matches `func` return value type, else matches the array elements type. 
**Examples** @@ -1421,7 +1421,7 @@ arraySum([func,] arr) - The sum of the function values (or the array sum). -Type: for decimal numbers in source array — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [UInt64](../../sql-reference/data-types/int-uint.md), and for numeric signed — [Int64](../../sql-reference/data-types/int-uint.md). +Type: for decimal numbers in source array (or for converted values, if `func` is specified) — [Decimal128](../../sql-reference/data-types/decimal.md), for floating point numbers — [Float64](../../sql-reference/data-types/float.md), for numeric unsigned — [UInt64](../../sql-reference/data-types/int-uint.md), and for numeric signed — [Int64](../../sql-reference/data-types/int-uint.md). **Examples** diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 3bba6f799c3..82976af5fbc 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1158,7 +1158,7 @@ arrayMin([func,] arr) - Минимальное значение функции (или минимальный элемент массива). -Тип: соответствует типу элементов массива. +Тип: если передана `func`, соответствует типу ее возвращаемого значения, иначе соответствует типу элементов массива. **Примеры** @@ -1213,7 +1213,7 @@ arrayMax([func,] arr) - Максимальное значение функции (или максимальный элемент массива). -Тип: соответствует типу элементов массива. +Тип: если передана `func`, соответствует типу ее возвращаемого значения, иначе соответствует типу элементов массива. **Примеры** @@ -1268,7 +1268,7 @@ arraySum([func,] arr) - Сумма значений функции (или сумма элементов массива). -Тип: для Decimal чисел в исходном массиве — [Decimal128](../../sql-reference/data-types/decimal.md), для чисел с плавающей точкой — [Float64](../../sql-reference/data-types/float.md), для беззнаковых целых чисел — [UInt64](../../sql-reference/data-types/int-uint.md), для целых чисел со знаком — [Int64](../../sql-reference/data-types/int-uint.md). +Тип: для Decimal чисел в исходном массиве (если функция `func` была передана, то для чисел, преобразованных ею) — [Decimal128](../../sql-reference/data-types/decimal.md), для чисел с плавающей точкой — [Float64](../../sql-reference/data-types/float.md), для беззнаковых целых чисел — [UInt64](../../sql-reference/data-types/int-uint.md), для целых чисел со знаком — [Int64](../../sql-reference/data-types/int-uint.md). 
**Примеры** From 529fb1ea49d2e0071ecaa938d04a65be42e75324 Mon Sep 17 00:00:00 2001 From: feng lv Date: Sat, 6 Feb 2021 14:59:48 +0000 Subject: [PATCH 100/122] remove some useless code --- .../RewriteSumIfFunctionVisitor.cpp | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/Interpreters/RewriteSumIfFunctionVisitor.cpp b/src/Interpreters/RewriteSumIfFunctionVisitor.cpp index 2fb0765db13..2593c220c63 100644 --- a/src/Interpreters/RewriteSumIfFunctionVisitor.cpp +++ b/src/Interpreters/RewriteSumIfFunctionVisitor.cpp @@ -13,18 +13,6 @@ void RewriteSumIfFunctionMatcher::visit(ASTPtr & ast, Data & data) visit(*func, ast, data); } -static ASTPtr createNewFunctionWithOneArgument(const String & func_name, const ASTPtr & argument) -{ - auto new_func = std::make_shared(); - new_func->name = func_name; - - auto new_arguments = std::make_shared(); - new_arguments->children.push_back(argument); - new_func->arguments = new_arguments; - new_func->children.push_back(new_arguments); - return new_func; -} - void RewriteSumIfFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, Data &) { if (!func.arguments || func.arguments->children.empty()) @@ -46,7 +34,7 @@ void RewriteSumIfFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, if (func_arguments.size() == 2 && literal->value.get() == 1) { - auto new_func = createNewFunctionWithOneArgument("countIf", func_arguments[1]); + auto new_func = makeASTFunction("countIf", func_arguments[1]); new_func->setAlias(func.alias); ast = std::move(new_func); return; @@ -74,7 +62,7 @@ void RewriteSumIfFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, /// sum(if(cond, 1, 0)) -> countIf(cond) if (first_value == 1 && second_value == 0) { - auto new_func = createNewFunctionWithOneArgument("countIf", if_arguments[0]); + auto new_func = makeASTFunction("countIf", if_arguments[0]); new_func->setAlias(func.alias); ast = std::move(new_func); return; @@ -82,8 +70,8 @@ void RewriteSumIfFunctionMatcher::visit(const ASTFunction & func, ASTPtr & ast, /// sum(if(cond, 0, 1)) -> countIf(not(cond)) if (first_value == 0 && second_value == 1) { - auto not_func = createNewFunctionWithOneArgument("not", if_arguments[0]); - auto new_func = createNewFunctionWithOneArgument("countIf", not_func); + auto not_func = makeASTFunction("not", if_arguments[0]); + auto new_func = makeASTFunction("countIf", not_func); new_func->setAlias(func.alias); ast = std::move(new_func); return; From 0cd36019280bc243d9b5bb926cfa2ec08ccaf623 Mon Sep 17 00:00:00 2001 From: Maxim Akhmedov Date: Sat, 6 Feb 2021 18:08:42 +0300 Subject: [PATCH 101/122] Allow using MergeTreeWhereOptimizer not only with MergeTree-based storages --- src/Interpreters/InterpreterSelectQuery.cpp | 9 +++++++-- .../MergeTree/MergeTreeWhereOptimizer.cpp | 18 +++++------------- .../MergeTree/MergeTreeWhereOptimizer.h | 4 +--- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 2ee1b3956e4..6122719d94e 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -390,13 +390,18 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (try_move_to_prewhere && storage && !row_policy_filter && query.where() && !query.prewhere() && !query.final()) { /// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable - if (const auto * merge_tree = dynamic_cast(storage.get())) + if (const auto & column_sizes 
= storage->getColumnSizes(); !column_sizes.empty()) { + /// Extract column compressed sizes. + std::unordered_map column_compressed_sizes; + for (const auto & [name, sizes] : column_sizes) + column_compressed_sizes[name] = sizes.data_compressed; + SelectQueryInfo current_info; current_info.query = query_ptr; current_info.syntax_analyzer_result = syntax_analyzer_result; - MergeTreeWhereOptimizer{current_info, *context, *merge_tree, metadata_snapshot, syntax_analyzer_result->requiredSourceColumns(), log}; + MergeTreeWhereOptimizer{current_info, *context, std::move(column_compressed_sizes), metadata_snapshot, syntax_analyzer_result->requiredSourceColumns(), log}; } } diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 5d6b74cabe9..34cac56d74c 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -30,7 +30,7 @@ static constexpr auto threshold = 2; MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( SelectQueryInfo & query_info, const Context & context, - const MergeTreeData & data, + std::unordered_map column_sizes_, const StorageMetadataPtr & metadata_snapshot, const Names & queried_columns_, Poco::Logger * log_) @@ -39,28 +39,20 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( , queried_columns{queried_columns_} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} + , column_sizes{std::move(column_sizes_)} { const auto & primary_key = metadata_snapshot->getPrimaryKey(); if (!primary_key.column_names.empty()) first_primary_key_column = primary_key.column_names[0]; - calculateColumnSizes(data, queried_columns); + for (const auto & [_, size] : column_sizes) + total_size_of_queried_columns += size; + determineArrayJoinedNames(query_info.query->as()); optimize(query_info.query->as()); } -void MergeTreeWhereOptimizer::calculateColumnSizes(const MergeTreeData & data, const Names & column_names) -{ - for (const auto & column_name : column_names) - { - UInt64 size = data.getColumnCompressedSize(column_name); - column_sizes[column_name] = size; - total_size_of_queried_columns += size; - } -} - - static void collectIdentifiersNoSubqueries(const ASTPtr & ast, NameSet & set) { if (auto opt_name = tryGetIdentifierName(ast)) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 939c265b3e5..cad77fb9eed 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -33,7 +33,7 @@ public: MergeTreeWhereOptimizer( SelectQueryInfo & query_info, const Context & context, - const MergeTreeData & data, + std::unordered_map column_sizes_, const StorageMetadataPtr & metadata_snapshot, const Names & queried_columns_, Poco::Logger * log_); @@ -75,8 +75,6 @@ private: /// Transform Conditions list to WHERE or PREWHERE expression. 
static ASTPtr reconstruct(const Conditions & conditions); - void calculateColumnSizes(const MergeTreeData & data, const Names & column_names); - void optimizeConjunction(ASTSelectQuery & select, ASTFunction * const fun) const; void optimizeArbitrary(ASTSelectQuery & select) const; From 8953fe1eb29774689e3163c161323109526c072c Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Sat, 6 Feb 2021 19:30:46 +0300 Subject: [PATCH 102/122] Fix seekable buffer --- src/IO/ReadBufferFromFileDescriptor.cpp | 1 + src/IO/SeekableReadBuffer.h | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/IO/ReadBufferFromFileDescriptor.cpp b/src/IO/ReadBufferFromFileDescriptor.cpp index 0ab07b85027..dd5d9e67cd7 100644 --- a/src/IO/ReadBufferFromFileDescriptor.cpp +++ b/src/IO/ReadBufferFromFileDescriptor.cpp @@ -90,6 +90,7 @@ bool ReadBufferFromFileDescriptor::nextImpl() if (bytes_read) { ProfileEvents::increment(ProfileEvents::ReadBufferFromFileDescriptorReadBytes, bytes_read); + working_buffer = internal_buffer; working_buffer.resize(bytes_read); } else diff --git a/src/IO/SeekableReadBuffer.h b/src/IO/SeekableReadBuffer.h index f7a468b0490..f8e6d817fb1 100644 --- a/src/IO/SeekableReadBuffer.h +++ b/src/IO/SeekableReadBuffer.h @@ -21,6 +21,12 @@ public: */ virtual off_t seek(off_t off, int whence) = 0; + /** + * Keep in mind that seekable buffer may encounter eof() once and the working buffer + * may get into inconsistent state. Don't forget to reset it on the first nextImpl() + * after seek(). + */ + /** * @return Offset from the begin of the underlying buffer / file corresponds to the buffer current position. */ From ab55556c5e3600528ba8f5e3d54638990b5b3a5b Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Sat, 6 Feb 2021 20:24:52 +0300 Subject: [PATCH 103/122] Fix build of utils --- utils/check-mysql-binlog/main.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/check-mysql-binlog/main.cpp b/utils/check-mysql-binlog/main.cpp index ccdc4cd168c..04dfb56ff08 100644 --- a/utils/check-mysql-binlog/main.cpp +++ b/utils/check-mysql-binlog/main.cpp @@ -69,21 +69,27 @@ static DB::MySQLReplication::BinlogEventPtr parseSingleEventBody( case DB::MySQLReplication::WRITE_ROWS_EVENT_V1: case DB::MySQLReplication::WRITE_ROWS_EVENT_V2: { - event = std::make_shared(last_table_map_event, std::move(header)); + DB::MySQLReplication::RowsEventHeader rows_header(header.type); + rows_header.parse(*event_payload); + event = std::make_shared(last_table_map_event, std::move(header), rows_header); event->parseEvent(*event_payload); break; } case DB::MySQLReplication::DELETE_ROWS_EVENT_V1: case DB::MySQLReplication::DELETE_ROWS_EVENT_V2: { - event = std::make_shared(last_table_map_event, std::move(header)); + DB::MySQLReplication::RowsEventHeader rows_header(header.type); + rows_header.parse(*event_payload); + event = std::make_shared(last_table_map_event, std::move(header), rows_header); event->parseEvent(*event_payload); break; } case DB::MySQLReplication::UPDATE_ROWS_EVENT_V1: case DB::MySQLReplication::UPDATE_ROWS_EVENT_V2: { - event = std::make_shared(last_table_map_event, std::move(header)); + DB::MySQLReplication::RowsEventHeader rows_header(header.type); + rows_header.parse(*event_payload); + event = std::make_shared(last_table_map_event, std::move(header), rows_header); event->parseEvent(*event_payload); break; } From fd899daa0096c10dc8f0c18c0bff1f97615aabbf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 6 Feb 2021 22:17:19 +0300 Subject: [PATCH 104/122] Fix UBSan 
report in arrayCumSum --- src/Functions/array/arrayCumSum.cpp | 62 +++++++++++++++++------------ 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/src/Functions/array/arrayCumSum.cpp b/src/Functions/array/arrayCumSum.cpp index 40c0cd4ade2..97c623d0be9 100644 --- a/src/Functions/array/arrayCumSum.cpp +++ b/src/Functions/array/arrayCumSum.cpp @@ -45,6 +45,41 @@ struct ArrayCumSumImpl } + template + static void NO_SANITIZE_UNDEFINED implConst( + size_t size, const IColumn::Offset * __restrict offsets, Dst * __restrict res_values, Src src_value) + { + size_t pos = 0; + for (const auto * end = offsets + size; offsets < end; ++offsets) + { + auto offset = *offsets; + Src accumulated{}; + for (; pos < offset; ++pos) + { + accumulated += src_value; + res_values[pos] = accumulated; + } + } + } + + template + static void NO_SANITIZE_UNDEFINED implVector( + size_t size, const IColumn::Offset * __restrict offsets, Dst * __restrict res_values, const Src * __restrict src_values) + { + size_t pos = 0; + for (const auto * end = offsets + size; offsets < end; ++offsets) + { + auto offset = *offsets; + Src accumulated{}; + for (; pos < offset; ++pos) + { + accumulated += src_values[pos]; + res_values[pos] = accumulated; + } + } + } + + template static bool executeType(const ColumnPtr & mapped, const ColumnArray & array, ColumnPtr & res_ptr) { @@ -75,19 +110,7 @@ struct ArrayCumSumImpl typename ColVecResult::Container & res_values = res_nested->getData(); res_values.resize(column_const->size()); - - size_t pos = 0; - for (auto offset : offsets) - { - // skip empty arrays - if (pos < offset) - { - res_values[pos++] = x; // NOLINT - for (; pos < offset; ++pos) - res_values[pos] = res_values[pos - 1] + x; - } - } - + implConst(offsets.size(), offsets.data(), res_values.data(), x); res_ptr = ColumnArray::create(std::move(res_nested), array.getOffsetsPtr()); return true; } @@ -103,18 +126,7 @@ struct ArrayCumSumImpl typename ColVecResult::Container & res_values = res_nested->getData(); res_values.resize(data.size()); - - size_t pos = 0; - for (auto offset : offsets) - { - // skip empty arrays - if (pos < offset) - { - res_values[pos] = data[pos]; // NOLINT - for (++pos; pos < offset; ++pos) - res_values[pos] = res_values[pos - 1] + data[pos]; - } - } + implVector(offsets.size(), offsets.data(), res_values.data(), data.data()); res_ptr = ColumnArray::create(std::move(res_nested), array.getOffsetsPtr()); return true; From 65902f4c6e21fdb6b48764d4ef4c96e588eb946b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 6 Feb 2021 22:21:22 +0300 Subject: [PATCH 105/122] Fix UBSan report in arrayCumSum --- src/Functions/array/arrayCumSum.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/array/arrayCumSum.cpp b/src/Functions/array/arrayCumSum.cpp index 97c623d0be9..96001901a6e 100644 --- a/src/Functions/array/arrayCumSum.cpp +++ b/src/Functions/array/arrayCumSum.cpp @@ -53,7 +53,7 @@ struct ArrayCumSumImpl for (const auto * end = offsets + size; offsets < end; ++offsets) { auto offset = *offsets; - Src accumulated{}; + Dst accumulated{}; for (; pos < offset; ++pos) { accumulated += src_value; @@ -70,7 +70,7 @@ struct ArrayCumSumImpl for (const auto * end = offsets + size; offsets < end; ++offsets) { auto offset = *offsets; - Src accumulated{}; + Dst accumulated{}; for (; pos < offset; ++pos) { accumulated += src_values[pos]; From d9d49a4d30c5fcbe970f69d4c4157cd1b5f85a51 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 6 Feb 2021 22:22:21 +0300 Subject: 
[PATCH 106/122] Fix UBSan report in arrayCumSum --- .../array/arrayCumSumNonNegative.cpp | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/Functions/array/arrayCumSumNonNegative.cpp b/src/Functions/array/arrayCumSumNonNegative.cpp index ff0f081d70b..148d4283701 100644 --- a/src/Functions/array/arrayCumSumNonNegative.cpp +++ b/src/Functions/array/arrayCumSumNonNegative.cpp @@ -48,6 +48,26 @@ struct ArrayCumSumNonNegativeImpl } + template + static void NO_SANITIZE_UNDEFINED implVector( + size_t size, const IColumn::Offset * __restrict offsets, Dst * __restrict res_values, const Src * __restrict src_values) + { + size_t pos = 0; + for (const auto * end = offsets + size; offsets < end; ++offsets) + { + auto offset = *offsets; + Dst accumulated{}; + for (; pos < offset; ++pos) + { + accumulated += src_values[pos]; + if (accumulated < 0) + accumulated = 0; + res_values[pos] = accumulated; + } + } + } + + template static bool executeType(const ColumnPtr & mapped, const ColumnArray & array, ColumnPtr & res_ptr) { @@ -70,26 +90,7 @@ struct ArrayCumSumNonNegativeImpl typename ColVecResult::Container & res_values = res_nested->getData(); res_values.resize(data.size()); - - size_t pos = 0; - Result accum_sum = 0; - for (auto offset : offsets) - { - // skip empty arrays - if (pos < offset) - { - accum_sum = data[pos] > 0 ? data[pos] : Element(0); // NOLINT - res_values[pos] = accum_sum; - for (++pos; pos < offset; ++pos) - { - accum_sum = accum_sum + data[pos]; - if (accum_sum < 0) - accum_sum = 0; - - res_values[pos] = accum_sum; - } - } - } + implVector(offsets.size(), offsets.data(), res_values.data(), data.data()); res_ptr = ColumnArray::create(std::move(res_nested), array.getOffsetsPtr()); return true; From 1209c02869b9c742afdee459edbbd6c1c25cf29f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 6 Feb 2021 23:18:42 +0300 Subject: [PATCH 107/122] Fix overflow in mapPopulateSeries --- src/Functions/array/mapPopulateSeries.cpp | 9 ++++++++- .../0_stateless/01698_map_populate_overflow.reference | 1 + .../queries/0_stateless/01698_map_populate_overflow.sql | 2 ++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01698_map_populate_overflow.reference create mode 100644 tests/queries/0_stateless/01698_map_populate_overflow.sql diff --git a/src/Functions/array/mapPopulateSeries.cpp b/src/Functions/array/mapPopulateSeries.cpp index 46c99dba483..2050e0c28ab 100644 --- a/src/Functions/array/mapPopulateSeries.cpp +++ b/src/Functions/array/mapPopulateSeries.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TOO_LARGE_ARRAY_SIZE; } class FunctionMapPopulateSeries : public IFunction @@ -188,9 +189,13 @@ private: } } + static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30; + if (static_cast(max_key - min_key) > MAX_ARRAY_SIZE) + throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in the result of function {}", getName()); + /* fill the result arrays */ KeyType key; - for (key = min_key; key <= max_key; ++key) + for (key = min_key;; ++key) { to_keys_data.insert(key); @@ -205,6 +210,8 @@ private: } ++offset; + if (key == max_key) + break; } to_keys_offsets.push_back(offset); diff --git a/tests/queries/0_stateless/01698_map_populate_overflow.reference b/tests/queries/0_stateless/01698_map_populate_overflow.reference new file mode 100644 index 00000000000..24e0038125a 
--- /dev/null +++ b/tests/queries/0_stateless/01698_map_populate_overflow.reference @@ -0,0 +1 @@ +([18446744073709551615],[0]) diff --git a/tests/queries/0_stateless/01698_map_populate_overflow.sql b/tests/queries/0_stateless/01698_map_populate_overflow.sql new file mode 100644 index 00000000000..90c47ff3949 --- /dev/null +++ b/tests/queries/0_stateless/01698_map_populate_overflow.sql @@ -0,0 +1,2 @@ +SELECT mapPopulateSeries([0xFFFFFFFFFFFFFFFF], [0], 0xFFFFFFFFFFFFFFFF); +SELECT mapPopulateSeries([toUInt64(1)], [1], 0xFFFFFFFFFFFFFFFF); -- { serverError 128 } From 4514c06c2fe194d4b49344bdf59fa8cc7b5071f0 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Sat, 6 Feb 2021 12:59:31 -0800 Subject: [PATCH 108/122] Docs - date_sub --- .../functions/date-time-functions.md | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 2cec116f986..664450b385a 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -518,6 +518,46 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────┘ ``` +## date\_sub {#date_sub} + +This subtracts a time/date interval from a date and then returns the date. + +**Syntax** + +``` sql +date_sub(unit, value, date) +``` + +Aliases: `dateSub`, `DATE_SUB`. + +**Parameters** + +- `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md). + + Supported values: second, minute, hour, day, week, month, quarter, year. +- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) +- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). + +**Returned value** + +Returns Date or DateTime with `value` expressed in `unit` subtracted from `date`. + +**Example** + +Query: + +``` sql +SELECT date_sub(YEAR, 3, toDate('2018-01-01')); +``` + +Result: + +``` text +┌─minus(toDate('2018-01-01'), toIntervalYear(3))─┐ +│ 2015-01-01 │ +└────────────────────────────────────────────────┘ +``` + ## timestamp\_add {#timestamp_add} Adds the specified time value with the provided date or date time value. From 5ddcbe8b90f8e5964afed5a265a1284d44310797 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Sat, 6 Feb 2021 13:05:06 -0800 Subject: [PATCH 109/122] Docs - minor unrelated fix to date_trunc --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 664450b385a..86a1110caf9 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -380,7 +380,7 @@ Alias: `dateTrunc`. **Parameters** -- `unit` — The type of interval to truncate the result. [String](../../sql-reference/data-types/string.md). +- `unit` — The type of interval to truncate the result. [String Literal](../syntax.md#syntax-string-literal). 
Possible values: - `second` From 44b44c1fe7ca591087296cbdd783b9154ec7a6b0 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 7 Feb 2021 01:09:55 +0300 Subject: [PATCH 110/122] Update InterpreterSelectQuery.cpp --- src/Interpreters/InterpreterSelectQuery.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 6122719d94e..4b89273cd86 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -69,7 +69,6 @@ #include #include -#include #include #include #include From 417cfcd6989bbe760ae2e4061e6fd07697e3bb6e Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 7 Feb 2021 01:31:20 +0300 Subject: [PATCH 111/122] Update ReadBuffer.h --- src/IO/ReadBuffer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index ae8898efcef..e871205aef3 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -198,7 +198,7 @@ private: */ virtual bool nextImpl() { return false; } - [[noreturn]] static inline void throwReadAfterEOF() + [[noreturn]] static void throwReadAfterEOF() { throw Exception("Attempt to read after eof", ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF); } From 46ff7d2ab04d0e1dc914f4fc038dd15572eb96b0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Feb 2021 22:06:23 +0300 Subject: [PATCH 112/122] Fix DateTime64 overflows --- src/Functions/FunctionsConversion.h | 6 ++++-- tests/queries/0_stateless/01691_DateTime64_clamp.reference | 2 ++ tests/queries/0_stateless/01691_DateTime64_clamp.sql | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index df0cba4c844..b95d4ea9790 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -491,8 +491,9 @@ struct ToDateTime64TransformUnsigned : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) {} - inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(const FromType & from, const DateLUTImpl &) const + inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const { + from = std::min(time_t(from), time_t(0xFFFFFFFF)); return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); } }; @@ -507,10 +508,11 @@ struct ToDateTime64TransformSigned : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) {} - inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(const FromType & from, const DateLUTImpl &) const + inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const { if (from < 0) return 0; + from = std::min(time_t(from), time_t(0xFFFFFFFF)); return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); } }; diff --git a/tests/queries/0_stateless/01691_DateTime64_clamp.reference b/tests/queries/0_stateless/01691_DateTime64_clamp.reference index 1d222e2cb21..de72027334c 100644 --- a/tests/queries/0_stateless/01691_DateTime64_clamp.reference +++ b/tests/queries/0_stateless/01691_DateTime64_clamp.reference @@ -7,3 +7,5 @@ SELECT CAST(-1 AS DateTime64); 1970-01-01 03:00:00.000 SELECT CAST('2020-01-01 00:00:00.3' AS DateTime64); 2020-01-01 00:00:00.300 +SELECT toDateTime64(bitShiftLeft(toUInt64(1),33), 2); +2106-02-07 09:28:15.00 diff --git a/tests/queries/0_stateless/01691_DateTime64_clamp.sql b/tests/queries/0_stateless/01691_DateTime64_clamp.sql index 7ccce597adf..6b5a4815f37 100644 
--- a/tests/queries/0_stateless/01691_DateTime64_clamp.sql +++ b/tests/queries/0_stateless/01691_DateTime64_clamp.sql @@ -3,3 +3,4 @@ SELECT toDateTime(-2, 2); SELECT toDateTime64(-2, 2); SELECT CAST(-1 AS DateTime64); SELECT CAST('2020-01-01 00:00:00.3' AS DateTime64); +SELECT toDateTime64(bitShiftLeft(toUInt64(1),33), 2); From 4181f8d9b7686d277d9fecf575526739306d6373 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Sun, 7 Feb 2021 11:06:39 +0300 Subject: [PATCH 113/122] Fix segfault using ANTLR parser (#20156) * Fix build of utils * Fix visitor --- src/Interpreters/MarkTableIdentifiersVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/MarkTableIdentifiersVisitor.cpp b/src/Interpreters/MarkTableIdentifiersVisitor.cpp index 78563059ed1..6557e1b5292 100644 --- a/src/Interpreters/MarkTableIdentifiersVisitor.cpp +++ b/src/Interpreters/MarkTableIdentifiersVisitor.cpp @@ -47,7 +47,7 @@ void MarkTableIdentifiersMatcher::visit(const ASTFunction & func, ASTPtr &, Data // First argument of dictGet can be a dictionary name, perhaps with a database. if (functionIsJoinGet(func.name) || functionIsDictGet(func.name)) { - if (func.arguments->children.empty()) + if (!func.arguments || func.arguments->children.empty()) { return; } From db04af3dcef946dd33f37ef8224e7687ac224433 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 7 Feb 2021 13:06:34 +0300 Subject: [PATCH 114/122] Fix macOS build docs --- docs/en/development/build-osx.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index c3a0a540b6d..d78dd12b2dc 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -40,7 +40,7 @@ $ cd ClickHouse ``` bash $ mkdir build $ cd build -$ cmake ..-DCMAKE_C_COMPILER=`brew --prefix llvm`/bin/clang -DCMAKE_CXX_COMPILER=`brew --prefix llvm`/bin/clang++ -DCMAKE_PREFIX_PATH=`brew --prefix llvm` +$ cmake .. -DCMAKE_C_COMPILER=`brew --prefix llvm`/bin/clang -DCMAKE_CXX_COMPILER=`brew --prefix llvm`/bin/clang++ -DCMAKE_PREFIX_PATH=`brew --prefix llvm` $ ninja $ cd .. ``` From 38a7248d3aceaed0ebaf73440df9a5e0cac79239 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sun, 7 Feb 2021 14:01:00 +0300 Subject: [PATCH 115/122] Minor changes in date_sub doc --- docs/en/sql-reference/functions/date-time-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 86a1110caf9..4a73bdb2546 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -520,7 +520,7 @@ Result: ## date\_sub {#date_sub} -This subtracts a time/date interval from a date and then returns the date. +Subtracts a time/date interval from the provided date. **Syntax** @@ -536,7 +536,7 @@ Aliases: `dateSub`, `DATE_SUB`. Supported values: second, minute, hour, day, week, month, quarter, year. - `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) -- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). +- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md) to subtract value from. 
**Returned value** From b26ebd6df8e84c27285919cf257079ccd9405154 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 7 Feb 2021 16:13:23 +0300 Subject: [PATCH 116/122] Slack link was expired --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e114d5abe9..53778c79bef 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ ClickHouse® is an open-source column-oriented database management system that a * [Tutorial](https://clickhouse.tech/docs/en/getting_started/tutorial/) shows how to set up and query small ClickHouse cluster. * [Documentation](https://clickhouse.tech/docs/en/) provides more in-depth information. * [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format. -* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-d2zxkf9e-XyxDa_ucfPxzuH4SJIm~Ng) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time. +* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-ly9m4w1x-6j7x5Ts_pQZqrctAbRZ3cg) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time. * [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announcements and reports about events. * [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation. * [Yandex.Messenger channel](https://yandex.ru/chat/#/join/20e380d9-c7be-4123-ab06-e95fb946975e) shares announcements and useful links in Russian. From c941d3bf2092bbe16248bcd01127df4303336974 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 16:14:31 +0300 Subject: [PATCH 117/122] Renew Slack link --- website/templates/index/community.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/templates/index/community.html b/website/templates/index/community.html index e65f9ff0f86..20b09e7318b 100644 --- a/website/templates/index/community.html +++ b/website/templates/index/community.html @@ -66,7 +66,7 @@
-
Date: Wed, 3 Feb 2021 11:00:20 +0300 Subject: [PATCH 118/122] Fix build --- contrib/base64-cmake/CMakeLists.txt | 2 +- contrib/hyperscan-cmake/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/base64-cmake/CMakeLists.txt b/contrib/base64-cmake/CMakeLists.txt index 63b4e324d29..a295ee45b84 100644 --- a/contrib/base64-cmake/CMakeLists.txt +++ b/contrib/base64-cmake/CMakeLists.txt @@ -11,7 +11,7 @@ endif () target_compile_options(base64_scalar PRIVATE -falign-loops) if (ARCH_AMD64) - target_compile_options(base64_ssse3 PRIVATE -mssse3 -falign-loops) + target_compile_options(base64_ssse3 PRIVATE -mno-avx -mno-avx2 -mssse3 -falign-loops) target_compile_options(base64_avx PRIVATE -falign-loops -mavx) target_compile_options(base64_avx2 PRIVATE -falign-loops -mavx2) else () diff --git a/contrib/hyperscan-cmake/CMakeLists.txt b/contrib/hyperscan-cmake/CMakeLists.txt index c44214cded8..75c45ff7bf5 100644 --- a/contrib/hyperscan-cmake/CMakeLists.txt +++ b/contrib/hyperscan-cmake/CMakeLists.txt @@ -252,6 +252,7 @@ if (NOT EXTERNAL_HYPERSCAN_LIBRARY_FOUND) target_compile_definitions (hyperscan PUBLIC USE_HYPERSCAN=1) target_compile_options (hyperscan PRIVATE -g0 # Library has too much debug information + -mno-avx -mno-avx2 # The library is using dynamic dispatch and is confused if AVX is enabled globally -march=corei7 -O2 -fno-strict-aliasing -fno-omit-frame-pointer -fvisibility=hidden # The options from original build system -fno-sanitize=undefined # Assume the library takes care of itself ) From 0145be85df3f11f7f809b96e5f9f9266b6ec3034 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 16:50:57 +0300 Subject: [PATCH 119/122] Do not spill warnings suppressions from ANTLR --- src/Parsers/New/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Parsers/New/CMakeLists.txt b/src/Parsers/New/CMakeLists.txt index 360dd4d7488..468394b7bd8 100644 --- a/src/Parsers/New/CMakeLists.txt +++ b/src/Parsers/New/CMakeLists.txt @@ -65,8 +65,6 @@ target_compile_options (clickhouse_parsers_new -Wno-documentation-deprecated-sync -Wno-shadow-field -Wno-unused-parameter - - PUBLIC -Wno-extra-semi -Wno-inconsistent-missing-destructor-override ) From e814db68360db68831ca631408d551cf00bfe2c5 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 7 Feb 2021 17:18:18 +0300 Subject: [PATCH 120/122] Update version_date.tsv after release 21.2.2.8 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index c4b27f3199d..8d05f5fff46 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.2.2.8-stable 2021-02-07 v21.1.3.32-stable 2021-02-03 v21.1.2.15-stable 2021-01-18 v20.12.5.18-stable 2021-02-03 From 06a92e497345495f65db17c8ccdb045bfa0726c9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 18:17:13 +0300 Subject: [PATCH 121/122] Add changelog for 21.2 --- CHANGELOG.md | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d2cc3e51997..fffd732f7d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,148 @@ +## ClickHouse release 21.2 + +### ClickHouse release v21.2.2.8-stable, 2021-02-07 + +#### Backward Incompatible Change + +* Bitwise functions (`bitAnd`, `bitOr`, etc) are forbidden for floating point arguments. Now you have to do explicit cast to integer. 
[#19853](https://github.com/ClickHouse/ClickHouse/pull/19853) ([Azat Khuzhin](https://github.com/azat)). +* Forbid `lcm`/`gcd` for floats. [#19532](https://github.com/ClickHouse/ClickHouse/pull/19532) ([Azat Khuzhin](https://github.com/azat)). +* Fix memory tracking for `OPTIMIZE TABLE`/merges; account query memory limits and sampling for `OPTIMIZE TABLE`/merges. [#18772](https://github.com/ClickHouse/ClickHouse/pull/18772) ([Azat Khuzhin](https://github.com/azat)). +* Disallow floating point column as partition key, see [#18421](https://github.com/ClickHouse/ClickHouse/issues/18421#event-4147046255). [#18464](https://github.com/ClickHouse/ClickHouse/pull/18464) ([hexiaoting](https://github.com/hexiaoting)). +* Excessive parentheses in type definitions are no longer supported, example: `Array((UInt8))`. + +#### New Feature + +* Added `PostgreSQL` table engine (both select/insert, with support for multidimensional arrays), also as table function. Added `PostgreSQL` dictionary source. Added `PostgreSQL` database engine. [#18554](https://github.com/ClickHouse/ClickHouse/pull/18554) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Data type `Nested` now supports arbitrary levels of nesting. Introduced subcolumns of complex types, such as `size0` in `Array`, `null` in `Nullable`, names of `Tuple` elements, which can be read without reading the whole column. [#17310](https://github.com/ClickHouse/ClickHouse/pull/17310) ([Anton Popov](https://github.com/CurtizJ)). +* Added `Nullable` support for `FlatDictionary`, `HashedDictionary`, `ComplexKeyHashedDictionary`, `DirectDictionary`, `ComplexKeyDirectDictionary`, `RangeHashedDictionary`. [#18236](https://github.com/ClickHouse/ClickHouse/pull/18236) ([Maksim Kita](https://github.com/kitaisreal)). +* Adds a new table called `system.distributed_ddl_queue` that displays the queries in the DDL worker queue. [#17656](https://github.com/ClickHouse/ClickHouse/pull/17656) ([Bharat Nallan](https://github.com/bharatnc)). +* Added support of mapping LDAP group names, and attribute values in general, to local roles for users from LDAP user directories. [#17211](https://github.com/ClickHouse/ClickHouse/pull/17211) ([Denis Glazachev](https://github.com/traceon)). +* Support insert into table function `cluster`, and for both table functions `remote` and `cluster`, support distributing data across nodes by specifying a sharding key. Closes [#16752](https://github.com/ClickHouse/ClickHouse/issues/16752). [#18264](https://github.com/ClickHouse/ClickHouse/pull/18264) ([flynn](https://github.com/ucasFL)). +* Add function `decodeXMLComponent` to decode characters for XML. Example: `SELECT decodeXMLComponent('Hello,&quot;world&quot;!')` returns `Hello,"world"!`. [#17659](https://github.com/ClickHouse/ClickHouse/issues/17659). [#18542](https://github.com/ClickHouse/ClickHouse/pull/18542) ([nauta](https://github.com/nautaa)). +* Added functions `parseDateTimeBestEffortUSOrZero`, `parseDateTimeBestEffortUSOrNull`. [#19712](https://github.com/ClickHouse/ClickHouse/pull/19712) ([Maksim Kita](https://github.com/kitaisreal)). +* Add `sign` math function. [#19527](https://github.com/ClickHouse/ClickHouse/pull/19527) ([flynn](https://github.com/ucasFL)). +* Add information about used features (functions, table engines, etc.) into system.query_log. [#18495](https://github.com/ClickHouse/ClickHouse/issues/18495). [#19371](https://github.com/ClickHouse/ClickHouse/pull/19371) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Function `formatDateTime` supports the `%Q` modifier to format the quarter of the date.
[#19224](https://github.com/ClickHouse/ClickHouse/pull/19224) ([Jianmei Zhang](https://github.com/zhangjmruc)). +* Support MetaKey+Enter hotkey binding in play UI. [#19012](https://github.com/ClickHouse/ClickHouse/pull/19012) ([sundyli](https://github.com/sundy-li)). +* Add three functions for the Map data type: 1. `mapContains(map, key)` checks whether the map keys include the given key. 2. `mapKeys(map)` returns all the keys in Array format. 3. `mapValues(map)` returns all the values in Array format (see the usage sketch below). [#18788](https://github.com/ClickHouse/ClickHouse/pull/18788) ([hexiaoting](https://github.com/hexiaoting)). +* Add `log_comment` setting related to [#18494](https://github.com/ClickHouse/ClickHouse/issues/18494). [#18549](https://github.com/ClickHouse/ClickHouse/pull/18549) ([Zijie Lu](https://github.com/TszKitLo40)). +* Add support of tuple argument to `argMin` and `argMax` functions. [#17359](https://github.com/ClickHouse/ClickHouse/pull/17359) ([Ildus Kurbangaliev](https://github.com/ildus)). +* Support `EXISTS VIEW` syntax. [#18552](https://github.com/ClickHouse/ClickHouse/pull/18552) ([Du Chuan](https://github.com/spongedu)). +* Add `SELECT ALL` syntax. Closes [#18706](https://github.com/ClickHouse/ClickHouse/issues/18706). [#18723](https://github.com/ClickHouse/ClickHouse/pull/18723) ([flynn](https://github.com/ucasFL)). + +#### Performance Improvement + +* Faster parts removal by lowering the number of `stat` syscalls. This returns an optimization that existed a while ago. Safer interface of `IDisk`. This closes [#19065](https://github.com/ClickHouse/ClickHouse/issues/19065). [#19086](https://github.com/ClickHouse/ClickHouse/pull/19086) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Aliases declared in `WITH` statement are properly used in index analysis. Queries like `WITH column AS alias SELECT ... WHERE alias = ...` may now use the index. [#18896](https://github.com/ClickHouse/ClickHouse/pull/18896) ([Amos Bird](https://github.com/amosbird)). +* Add `optimize_alias_column_prediction` (on by default), that will: - Respect aliased columns in WHERE during partition pruning and skipping data using secondary indexes; - Respect aliased columns in WHERE for trivial count queries for optimize_trivial_count; - Respect aliased columns in GROUP BY/ORDER BY for optimize_aggregation_in_order/optimize_read_in_order. [#16995](https://github.com/ClickHouse/ClickHouse/pull/16995) ([sundyli](https://github.com/sundy-li)). +* Speed up aggregate function `sum`. Improvement only visible on synthetic benchmarks and not very practical. [#19216](https://github.com/ClickHouse/ClickHouse/pull/19216) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Update libc++ and use another ABI to provide better performance. [#18914](https://github.com/ClickHouse/ClickHouse/pull/18914) ([Danila Kutenin](https://github.com/danlark1)). +* Rewrite the `sumIf()` and `sum(if())` functions to the `countIf()` function when they are logically equivalent (see the example below). [#17041](https://github.com/ClickHouse/ClickHouse/pull/17041) ([flynn](https://github.com/ucasFL)). +* Use a connection pool for S3 connections, controlled by the `s3_max_connections` setting. [#13405](https://github.com/ClickHouse/ClickHouse/pull/13405) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Add support for the zstd long option for better compression of string columns to save space. [#17184](https://github.com/ClickHouse/ClickHouse/pull/17184) ([ygrek](https://github.com/ygrek)).
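To make the `sumIf`/`sum(if())` to `countIf` rewrite above concrete, here is a minimal sketch over `numbers(10)`; the three forms are logically equivalent, and the entry says the first two are rewritten into the third internally (whether the rewrite actually fires may depend on server settings, so treat that part as an assumption):

``` sql
-- Count even numbers in three equivalent ways.
SELECT
    sum(if(number % 2 = 0, 1, 0)) AS sum_if_form,
    sumIf(1, number % 2 = 0)      AS sumif_form,
    countIf(number % 2 = 0)       AS countif_form
FROM numbers(10);
-- All three columns return 5.
```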
+* Slightly improve server latency by removing access to configuration on every connection. [#19863](https://github.com/ClickHouse/ClickHouse/pull/19863) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Reduce lock contention for multiple layers of the `Buffer` engine. [#19379](https://github.com/ClickHouse/ClickHouse/pull/19379) ([Azat Khuzhin](https://github.com/azat)). +* Support splitting `Filter` step of query plan into `Expression + Filter` pair. Together with `Expression + Expression` merging optimization ([#17458](https://github.com/ClickHouse/ClickHouse/issues/17458)) it may delay execution for some expressions after `Filter` step. [#19253](https://github.com/ClickHouse/ClickHouse/pull/19253) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### Improvement + +* `SELECT count() FROM table` now can be executed if only one any column can be selected from the `table`. This PR fixes [#10639](https://github.com/ClickHouse/ClickHouse/issues/10639). [#18233](https://github.com/ClickHouse/ClickHouse/pull/18233) ([Vitaly Baranov](https://github.com/vitlibar)). +* Set charset to `utf8mb4` when interacting with remote MySQL servers. Fixes [#19795](https://github.com/ClickHouse/ClickHouse/issues/19795). [#19800](https://github.com/ClickHouse/ClickHouse/pull/19800) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* `S3` table function now supports `auto` compression mode (autodetect). This closes [#18754](https://github.com/ClickHouse/ClickHouse/issues/18754). [#19793](https://github.com/ClickHouse/ClickHouse/pull/19793) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Correctly output infinite arguments for `formatReadableTimeDelta` function. In previous versions, there was implicit conversion to implementation specific integer value. [#19791](https://github.com/ClickHouse/ClickHouse/pull/19791) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Table function `S3` will use global region if the region can't be determined exactly. This closes [#10998](https://github.com/ClickHouse/ClickHouse/issues/10998). [#19750](https://github.com/ClickHouse/ClickHouse/pull/19750) ([Vladimir Chebotarev](https://github.com/excitoon)). +* In distributed queries if the setting `async_socket_for_remote` is enabled, it was possible to get stack overflow at least in debug build configuration if very deeply nested data type is used in table (e.g. `Array(Array(Array(...more...)))`). This fixes [#19108](https://github.com/ClickHouse/ClickHouse/issues/19108). This change introduces minor backward incompatibility: excessive parenthesis in type definitions no longer supported, example: `Array((UInt8))`. [#19736](https://github.com/ClickHouse/ClickHouse/pull/19736) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add separate pool for message brokers (RabbitMQ and Kafka). [#19722](https://github.com/ClickHouse/ClickHouse/pull/19722) ([Azat Khuzhin](https://github.com/azat)). +* Fix rare `max_number_of_merges_with_ttl_in_pool` limit overrun (more merges with TTL can be assigned) for non-replicated MergeTree. [#19708](https://github.com/ClickHouse/ClickHouse/pull/19708) ([alesapin](https://github.com/alesapin)). +* Dictionary: better error message during attribute parsing. [#19678](https://github.com/ClickHouse/ClickHouse/pull/19678) ([Maksim Kita](https://github.com/kitaisreal)). +* Add an option to disable validation of checksums on reading. Should never be used in production. Please do not expect any benefits in disabling it. 
It may only be used for experiments and benchmarks. The setting only applicable for tables of MergeTree family. Checksums are always validated for other table engines and when receiving data over network. In my observations there is no performance difference or it is less than 0.5%. [#19588](https://github.com/ClickHouse/ClickHouse/pull/19588) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Support constant result in function `multiIf`. [#19533](https://github.com/ClickHouse/ClickHouse/pull/19533) ([Maksim Kita](https://github.com/kitaisreal)). +* Enable function length/empty/notEmpty for datatype Map, which returns keys number in Map. [#19530](https://github.com/ClickHouse/ClickHouse/pull/19530) ([taiyang-li](https://github.com/taiyang-li)). +* Add `--reconnect` option to `clickhouse-benchmark`. When this option is specified, it will reconnect before every request. This is needed for testing. [#19872](https://github.com/ClickHouse/ClickHouse/pull/19872) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Support using the new location of `.debug` file. This fixes [#19348](https://github.com/ClickHouse/ClickHouse/issues/19348). [#19520](https://github.com/ClickHouse/ClickHouse/pull/19520) ([Amos Bird](https://github.com/amosbird)). +* `toIPv6` function parses `IPv4` addresses. [#19518](https://github.com/ClickHouse/ClickHouse/pull/19518) ([Bharat Nallan](https://github.com/bharatnc)). +* Add `http_referer` field to `system.query_log`, `system.processes`, etc. This closes [#19389](https://github.com/ClickHouse/ClickHouse/issues/19389). [#19390](https://github.com/ClickHouse/ClickHouse/pull/19390) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve MySQL compatibility by making more functions case insensitive and adding aliases. [#19387](https://github.com/ClickHouse/ClickHouse/pull/19387) ([Daniil Kondratyev](https://github.com/dankondr)). +* Add metrics for MergeTree parts (Wide/Compact/InMemory) types. [#19381](https://github.com/ClickHouse/ClickHouse/pull/19381) ([Azat Khuzhin](https://github.com/azat)). +* Allow docker to be executed with arbitrary uid. [#19374](https://github.com/ClickHouse/ClickHouse/pull/19374) ([filimonov](https://github.com/filimonov)). +* Fix wrong alignment of values of `IPv4` data type in Pretty formats. They were aligned to the right, not to the left. This closes [#19184](https://github.com/ClickHouse/ClickHouse/issues/19184). [#19339](https://github.com/ClickHouse/ClickHouse/pull/19339) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Allow change `max_server_memory_usage` without restart. This closes [#18154](https://github.com/ClickHouse/ClickHouse/issues/18154). [#19186](https://github.com/ClickHouse/ClickHouse/pull/19186) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* The exception when function `bar` is called with certain NaN argument may be slightly misleading in previous versions. This fixes [#19088](https://github.com/ClickHouse/ClickHouse/issues/19088). [#19107](https://github.com/ClickHouse/ClickHouse/pull/19107) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Explicitly set uid / gid of clickhouse user & group to the fixed values (101) in clickhouse-server images. [#19096](https://github.com/ClickHouse/ClickHouse/pull/19096) ([filimonov](https://github.com/filimonov)). +* Fixed `PeekableReadBuffer: Memory limit exceed` error when inserting data with huge strings. Fixes [#18690](https://github.com/ClickHouse/ClickHouse/issues/18690). 
[#18979](https://github.com/ClickHouse/ClickHouse/pull/18979) ([tavplubix](https://github.com/tavplubix)). +* Docker image: several improvements for clickhouse-server entrypoint. [#18954](https://github.com/ClickHouse/ClickHouse/pull/18954) ([filimonov](https://github.com/filimonov)). +* Add `normalizeQueryKeepNames` and `normalizedQueryHashKeepNames` to normalize queries without masking long names with `?`. This helps better analyze complex query logs. [#18910](https://github.com/ClickHouse/ClickHouse/pull/18910) ([Amos Bird](https://github.com/amosbird)). +* - Check per-block checksum of the distributed batch on the sender before sending (without reading the file twice, the checksums will be verified while reading), this will avoid stuck of the INSERT on the receiver (on truncated .bin file on the sender) - Avoid reading .bin files twice for batched INSERT (it was required to calculate rows/bytes to take squashing into account, now this information included into the header, backward compatible is preserved). [#18853](https://github.com/ClickHouse/ClickHouse/pull/18853) ([Azat Khuzhin](https://github.com/azat)). +* Fix issues with RIGHT and FULL JOIN of tables with aggregate function states. In previous versions exception about `cloneResized` method was thrown. [#18818](https://github.com/ClickHouse/ClickHouse/pull/18818) ([templarzq](https://github.com/templarzq)). +* Added prefix-based S3 endpoint settings. [#18812](https://github.com/ClickHouse/ClickHouse/pull/18812) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Add [UInt8, UInt16, UInt32, UInt64] arguments types support for bitmapTransform, bitmapSubsetInRange, bitmapSubsetLimit, bitmapContains functions. This closes [#18713](https://github.com/ClickHouse/ClickHouse/issues/18713). [#18791](https://github.com/ClickHouse/ClickHouse/pull/18791) ([sundyli](https://github.com/sundy-li)). +* Allow CTE (Common Table Expressions) to be further aliased. Propagate CSE (Common Subexpressions Elimination) to subqueries in the same level when `enable_global_with_statement = 1`. This fixes [#17378](https://github.com/ClickHouse/ClickHouse/issues/17378) . This fixes https://github.com/ClickHouse/ClickHouse/pull/16575#issuecomment-753416235 . [#18684](https://github.com/ClickHouse/ClickHouse/pull/18684) ([Amos Bird](https://github.com/amosbird)). +* Update librdkafka to v1.6.0-RC2. Fixes [#18668](https://github.com/ClickHouse/ClickHouse/issues/18668). [#18671](https://github.com/ClickHouse/ClickHouse/pull/18671) ([filimonov](https://github.com/filimonov)). +* In case of unexpected exceptions automatically restart background thread which is responsible for execution of distributed DDL queries. Fixes [#17991](https://github.com/ClickHouse/ClickHouse/issues/17991). [#18285](https://github.com/ClickHouse/ClickHouse/pull/18285) ([徐炘](https://github.com/weeds085490)). +* Updated AWS C++ SDK in order to utilize global regions in S3. [#17870](https://github.com/ClickHouse/ClickHouse/pull/17870) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Added support for `WITH ... [AND] [PERIODIC] REFRESH [interval_in_sec]` clause when creating `LIVE VIEW` tables. [#14822](https://github.com/ClickHouse/ClickHouse/pull/14822) ([vzakaznikov](https://github.com/vzakaznikov)). +* Restrict `MODIFY TTL` queries for `MergeTree` tables created in old syntax. Previously the query succeeded, but actually it had no effect. [#19064](https://github.com/ClickHouse/ClickHouse/pull/19064) ([Anton Popov](https://github.com/CurtizJ)). 
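Before the Bug Fix list, a short usage sketch for the three `Map` helpers described in the New Feature section above. It assumes the `Map` type is still gated behind the experimental setting in this release and builds the value via `CAST` from a tuple of arrays, so no particular map literal syntax is relied upon:

``` sql
SET allow_experimental_map_type = 1;   -- assumed to be required for the Map type in 21.x

CREATE TABLE map_demo (m Map(String, UInt64)) ENGINE = Memory;
INSERT INTO map_demo SELECT CAST((['hits', 'misses'], [10, 2]), 'Map(String, UInt64)');

SELECT
    mapContains(m, 'hits') AS has_hits,    -- 1: the key is present
    mapKeys(m)             AS all_keys,    -- ['hits', 'misses']
    mapValues(m)           AS all_values   -- [10, 2]
FROM map_demo;
```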
+ +#### Bug Fix + +* Fix index analysis of binary functions with constant argument which leads to wrong query results. This fixes [#18364](https://github.com/ClickHouse/ClickHouse/issues/18364). [#18373](https://github.com/ClickHouse/ClickHouse/pull/18373) ([Amos Bird](https://github.com/amosbird)). +* Fix starting the server with tables having default expressions containing dictGet(). Allow getting return type of dictGet() without loading dictionary. [#19805](https://github.com/ClickHouse/ClickHouse/pull/19805) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix server crash after query with `if` function with `Tuple` type of then/else branches result. `Tuple` type must contain `Array` or another complex type. Fixes [#18356](https://github.com/ClickHouse/ClickHouse/issues/18356). [#20133](https://github.com/ClickHouse/ClickHouse/pull/20133) ([alesapin](https://github.com/alesapin)). +* `MaterializeMySQL` (experimental feature): Fix replication for statements that update several tables. [#20066](https://github.com/ClickHouse/ClickHouse/pull/20066) ([Håvard Kvålen](https://github.com/havardk)). +* Prevent "Connection refused" in docker during initialization script execution. [#20012](https://github.com/ClickHouse/ClickHouse/pull/20012) ([filimonov](https://github.com/filimonov)). +* `EmbeddedRocksDB` is an experimental storage. Fix the issue with lack of proper type checking. Simplified code. This closes [#19967](https://github.com/ClickHouse/ClickHouse/issues/19967). [#19972](https://github.com/ClickHouse/ClickHouse/pull/19972) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix a segfault in function `fromModifiedJulianDay` when the argument type is `Nullable(T)` for any integral types other than Int32. [#19959](https://github.com/ClickHouse/ClickHouse/pull/19959) ([PHO](https://github.com/depressed-pho)). +* The function `greatCircleAngle` returned inaccurate results in previous versions. This closes [#19769](https://github.com/ClickHouse/ClickHouse/issues/19769). [#19789](https://github.com/ClickHouse/ClickHouse/pull/19789) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix rare bug when some replicated operations (like mutation) cannot process some parts after data corruption. Fixes [#19593](https://github.com/ClickHouse/ClickHouse/issues/19593). [#19702](https://github.com/ClickHouse/ClickHouse/pull/19702) ([alesapin](https://github.com/alesapin)). +* Background thread which executes `ON CLUSTER` queries might hang waiting for dropped replicated table to do something. It's fixed. [#19684](https://github.com/ClickHouse/ClickHouse/pull/19684) ([yiguolei](https://github.com/yiguolei)). +* Fix wrong deserialization of columns description. It makes INSERT into a table with a column named `\` impossible. [#19479](https://github.com/ClickHouse/ClickHouse/pull/19479) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Mark distributed batch as broken in case of empty data block in one of files. [#19449](https://github.com/ClickHouse/ClickHouse/pull/19449) ([Azat Khuzhin](https://github.com/azat)). +* Fixed very rare bug that might cause mutation to hang after `DROP/DETACH/REPLACE/MOVE PARTITION`. It was partially fixed by [#15537](https://github.com/ClickHouse/ClickHouse/issues/15537) for the most cases. [#19443](https://github.com/ClickHouse/ClickHouse/pull/19443) ([tavplubix](https://github.com/tavplubix)). +* Fix possible error `Extremes transform was already added to pipeline`. 
Fixes [#14100](https://github.com/ClickHouse/ClickHouse/issues/14100). [#19430](https://github.com/ClickHouse/ClickHouse/pull/19430) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix default value in join types with non-zero default (e.g. some Enums). Closes [#18197](https://github.com/ClickHouse/ClickHouse/issues/18197). [#19360](https://github.com/ClickHouse/ClickHouse/pull/19360) ([vdimir](https://github.com/vdimir)). +* Do not mark file for distributed send as broken on EOF. [#19290](https://github.com/ClickHouse/ClickHouse/pull/19290) ([Azat Khuzhin](https://github.com/azat)). +* Fix leaking of pipe fd for `async_socket_for_remote`. [#19153](https://github.com/ClickHouse/ClickHouse/pull/19153) ([Azat Khuzhin](https://github.com/azat)). +* Fix infinite reading from file in `ORC` format (was introduced in [#10580](https://github.com/ClickHouse/ClickHouse/issues/10580)). Fixes [#19095](https://github.com/ClickHouse/ClickHouse/issues/19095). [#19134](https://github.com/ClickHouse/ClickHouse/pull/19134) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix issue in merge tree data writer which can lead to marks with bigger size than fixed granularity size. Fixes [#18913](https://github.com/ClickHouse/ClickHouse/issues/18913). [#19123](https://github.com/ClickHouse/ClickHouse/pull/19123) ([alesapin](https://github.com/alesapin)). +* Fix startup bug when clickhouse was not able to read compression codec from `LowCardinality(Nullable(...))` and throws exception `Attempt to read after EOF`. Fixes [#18340](https://github.com/ClickHouse/ClickHouse/issues/18340). [#19101](https://github.com/ClickHouse/ClickHouse/pull/19101) ([alesapin](https://github.com/alesapin)). +* Simplify the implementation of `tupleHammingDistance`. Support for tuples of any equal length. Fixes [#19029](https://github.com/ClickHouse/ClickHouse/issues/19029). [#19084](https://github.com/ClickHouse/ClickHouse/pull/19084) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Make sure `groupUniqArray` returns correct type for argument of Enum type. This closes [#17875](https://github.com/ClickHouse/ClickHouse/issues/17875). [#19019](https://github.com/ClickHouse/ClickHouse/pull/19019) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix possible error `Expected single dictionary argument for function` if use function `ignore` with `LowCardinality` argument. Fixes [#14275](https://github.com/ClickHouse/ClickHouse/issues/14275). [#19016](https://github.com/ClickHouse/ClickHouse/pull/19016) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix inserting of `LowCardinality` column to table with `TinyLog` engine. Fixes [#18629](https://github.com/ClickHouse/ClickHouse/issues/18629). [#19010](https://github.com/ClickHouse/ClickHouse/pull/19010) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix minor issue in JOIN: Join tries to materialize const columns, but our code waits for them in other places. [#18982](https://github.com/ClickHouse/ClickHouse/pull/18982) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Disable `optimize_move_functions_out_of_any` because optimization is not always correct. This closes [#18051](https://github.com/ClickHouse/ClickHouse/issues/18051). This closes [#18973](https://github.com/ClickHouse/ClickHouse/issues/18973). [#18981](https://github.com/ClickHouse/ClickHouse/pull/18981) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
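A small counterexample may help explain why the `optimize_move_functions_out_of_any` rewrite mentioned in the entry above is not always correct in general (the values here are illustrative, and the concrete issues behind the linked tickets may differ): `any()` picks its value independently for each aggregate, so pulling an expression out of it can combine values taken from different rows.

``` sql
SELECT
    any(a + b)      AS from_one_row,    -- always the sum of a single row: 11 or 22
    any(a) + any(b) AS possibly_mixed   -- may combine different rows: 11, 12, 21 or 22
FROM
(
    SELECT 1 AS a, 10 AS b
    UNION ALL
    SELECT 2 AS a, 20 AS b
);
```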
+* Fix possible exception `QueryPipeline stream: different number of columns` caused by merging of query plan's `Expression` steps. Fixes [#18190](https://github.com/ClickHouse/ClickHouse/issues/18190). [#18980](https://github.com/ClickHouse/ClickHouse/pull/18980) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fixed very rare deadlock at shutdown. [#18977](https://github.com/ClickHouse/ClickHouse/pull/18977) ([tavplubix](https://github.com/tavplubix)). +* Fixed rare crashes when server run out of memory. [#18976](https://github.com/ClickHouse/ClickHouse/pull/18976) ([tavplubix](https://github.com/tavplubix)). +* Fix incorrect behavior when `ALTER TABLE ... DROP PART 'part_name'` query removes all deduplication blocks for the whole partition. Fixes [#18874](https://github.com/ClickHouse/ClickHouse/issues/18874). [#18969](https://github.com/ClickHouse/ClickHouse/pull/18969) ([alesapin](https://github.com/alesapin)). +* Fixed issue [#18894](https://github.com/ClickHouse/ClickHouse/issues/18894) Add a check to avoid exception when long column alias('table.column' style, usually auto-generated by BI tools like Looker) equals to long table name. [#18968](https://github.com/ClickHouse/ClickHouse/pull/18968) ([Daniel Qin](https://github.com/mathfool)). +* Fix error `Task was not found in task queue` (possible only for remote queries, with `async_socket_for_remote = 1`). [#18964](https://github.com/ClickHouse/ClickHouse/pull/18964) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix bug when mutation with some escaped text (like `ALTER ... UPDATE e = CAST('foo', 'Enum8(\'foo\' = 1')` serialized incorrectly. Fixes [#18878](https://github.com/ClickHouse/ClickHouse/issues/18878). [#18944](https://github.com/ClickHouse/ClickHouse/pull/18944) ([alesapin](https://github.com/alesapin)). +* ATTACH PARTITION will reset mutations. [#18804](https://github.com/ClickHouse/ClickHouse/issues/18804). [#18935](https://github.com/ClickHouse/ClickHouse/pull/18935) ([fastio](https://github.com/fastio)). +* Fix issue with `bitmapOrCardinality` that may lead to nullptr dereference. This closes [#18911](https://github.com/ClickHouse/ClickHouse/issues/18911). [#18912](https://github.com/ClickHouse/ClickHouse/pull/18912) ([sundyli](https://github.com/sundy-li)). +* Fixed `Attempt to read after eof` error when trying to `CAST` `NULL` from `Nullable(String)` to `Nullable(Decimal(P, S))`. Now function `CAST` returns `NULL` when it cannot parse decimal from nullable string. Fixes [#7690](https://github.com/ClickHouse/ClickHouse/issues/7690). [#18718](https://github.com/ClickHouse/ClickHouse/pull/18718) ([Winter Zhang](https://github.com/zhang2014)). +* Fix data type convert issue for MySQL engine. [#18124](https://github.com/ClickHouse/ClickHouse/pull/18124) ([bo zeng](https://github.com/mis98zb)). +* Fix clickhouse-client abort exception while executing only `select`. [#19790](https://github.com/ClickHouse/ClickHouse/pull/19790) ([taiyang-li](https://github.com/taiyang-li)). + + +#### Build/Testing/Packaging Improvement + +* Run [SQLancer](https://twitter.com/RiggerManuel/status/1352345625480884228) (logical SQL fuzzer) in CI. [#19006](https://github.com/ClickHouse/ClickHouse/pull/19006) ([Ilya Yatsishin](https://github.com/qoega)). +* Query Fuzzer will fuzz newly added tests more extensively. This closes [#18916](https://github.com/ClickHouse/ClickHouse/issues/18916). [#19185](https://github.com/ClickHouse/ClickHouse/pull/19185) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
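Looking back at the `Nested`/subcolumns entry in the New Feature section above, a brief sketch of reading subcolumns; the table and column names are made up, and the subcolumn names `size0` and `null` are taken directly from that entry:

``` sql
CREATE TABLE events
(
    id   UInt64,
    tags Array(String),
    note Nullable(String)
)
ENGINE = MergeTree ORDER BY id;

-- Per-row metadata can be read without reading the full columns:
SELECT
    tags.size0 AS tags_count,    -- array sizes, read without the array values
    note.null  AS note_is_null   -- null mask of the Nullable column
FROM events;
```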
+* Integrate with [Big List of Naughty Strings](https://github.com/minimaxir/big-list-of-naughty-strings/) for better fuzzing. [#19480](https://github.com/ClickHouse/ClickHouse/pull/19480) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add integration tests run with MSan. [#18974](https://github.com/ClickHouse/ClickHouse/pull/18974) ([alesapin](https://github.com/alesapin)). +* Fixed MemorySanitizer errors in cyrus-sasl and musl. [#19821](https://github.com/ClickHouse/ClickHouse/pull/19821) ([Ilya Yatsishin](https://github.com/qoega)). +* Insufficient arguments check in the `positionCaseInsensitiveUTF8` function triggered the address sanitizer. [#19720](https://github.com/ClickHouse/ClickHouse/pull/19720) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Remove `--project-directory` for docker-compose in integration tests. Fix log formatting from the docker container. [#19706](https://github.com/ClickHouse/ClickHouse/pull/19706) ([Ilya Yatsishin](https://github.com/qoega)). +* Made generation of macros.xml easier for integration tests. No more excessive logging from dicttoxml. The dicttoxml project has not been active for 5+ years. [#19697](https://github.com/ClickHouse/ClickHouse/pull/19697) ([Ilya Yatsishin](https://github.com/qoega)). +* Allow explicitly enabling or disabling the watchdog via the environment variable `CLICKHOUSE_WATCHDOG_ENABLE`. By default it is enabled if the server is not attached to a terminal. [#19522](https://github.com/ClickHouse/ClickHouse/pull/19522) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Allow building ClickHouse with Kafka support on arm64. [#19369](https://github.com/ClickHouse/ClickHouse/pull/19369) ([filimonov](https://github.com/filimonov)). +* Allow building librdkafka without SSL. [#19337](https://github.com/ClickHouse/ClickHouse/pull/19337) ([filimonov](https://github.com/filimonov)). +* Restore Kafka input in FreeBSD builds. [#18924](https://github.com/ClickHouse/ClickHouse/pull/18924) ([Alexandre Snarskii](https://github.com/snar)). +* Fix potential nullptr dereference in table function `VALUES`. [#19357](https://github.com/ClickHouse/ClickHouse/pull/19357) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Avoid UBSan reports in `arrayElement` function, `substring` and `arraySum`. Fixes [#19305](https://github.com/ClickHouse/ClickHouse/issues/19305). Fixes [#19287](https://github.com/ClickHouse/ClickHouse/issues/19287). This closes [#19336](https://github.com/ClickHouse/ClickHouse/issues/19336). [#19347](https://github.com/ClickHouse/ClickHouse/pull/19347) ([alexey-milovidov](https://github.com/alexey-milovidov)). + + ## ClickHouse release 21.1 ### ClickHouse release v21.1.3.32-stable, 2021-02-03 From 6325b15a63335e2efd7de1ae92d2907493a07a9c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sun, 7 Feb 2021 18:19:10 +0300 Subject: [PATCH 122/122] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fffd732f7d7..e2c777b3bcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -72,7 +72,7 @@ * Fixed `PeekableReadBuffer: Memory limit exceed` error when inserting data with huge strings. Fixes [#18690](https://github.com/ClickHouse/ClickHouse/issues/18690). [#18979](https://github.com/ClickHouse/ClickHouse/pull/18979) ([tavplubix](https://github.com/tavplubix)). * Docker image: several improvements for clickhouse-server entrypoint. [#18954](https://github.com/ClickHouse/ClickHouse/pull/18954) ([filimonov](https://github.com/filimonov)).
* Add `normalizeQueryKeepNames` and `normalizedQueryHashKeepNames` to normalize queries without masking long names with `?`. This helps better analyze complex query logs. [#18910](https://github.com/ClickHouse/ClickHouse/pull/18910) ([Amos Bird](https://github.com/amosbird)). -* - Check per-block checksum of the distributed batch on the sender before sending (without reading the file twice, the checksums will be verified while reading), this will avoid stuck of the INSERT on the receiver (on truncated .bin file on the sender) - Avoid reading .bin files twice for batched INSERT (it was required to calculate rows/bytes to take squashing into account, now this information included into the header, backward compatible is preserved). [#18853](https://github.com/ClickHouse/ClickHouse/pull/18853) ([Azat Khuzhin](https://github.com/azat)). +* Check per-block checksum of the distributed batch on the sender before sending (without reading the file twice, the checksums will be verified while reading), this will avoid stuck of the INSERT on the receiver (on truncated .bin file on the sender). Avoid reading .bin files twice for batched INSERT (it was required to calculate rows/bytes to take squashing into account, now this information included into the header, backward compatible is preserved). [#18853](https://github.com/ClickHouse/ClickHouse/pull/18853) ([Azat Khuzhin](https://github.com/azat)). * Fix issues with RIGHT and FULL JOIN of tables with aggregate function states. In previous versions exception about `cloneResized` method was thrown. [#18818](https://github.com/ClickHouse/ClickHouse/pull/18818) ([templarzq](https://github.com/templarzq)). * Added prefix-based S3 endpoint settings. [#18812](https://github.com/ClickHouse/ClickHouse/pull/18812) ([Vladimir Chebotarev](https://github.com/excitoon)). * Add [UInt8, UInt16, UInt32, UInt64] arguments types support for bitmapTransform, bitmapSubsetInRange, bitmapSubsetLimit, bitmapContains functions. This closes [#18713](https://github.com/ClickHouse/ClickHouse/issues/18713). [#18791](https://github.com/ClickHouse/ClickHouse/pull/18791) ([sundyli](https://github.com/sundy-li)).
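As a closing illustration of the widened bitmap function arguments described in the last entry above (UInt8 through UInt64 are now accepted), a minimal sketch with made-up values:

``` sql
WITH bitmapBuild([toUInt64(1), toUInt64(5), toUInt64(5000000000)]) AS bm
SELECT
    bitmapContains(bm, toUInt64(5))                                   AS has_5,       -- 1
    bitmapToArray(bitmapSubsetInRange(bm, toUInt64(0), toUInt64(10))) AS small_keys;  -- [1, 5]
```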