From aa092eeffb8f09d9d65294bfa2a62cacc258e562 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 27 Dec 2021 16:42:06 +0300 Subject: [PATCH 01/58] proper handle of 'max_rows_to_read' in case of reading in order of sorting key and limit --- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 14 ++++++++++-- .../MergeTree/MergeTreeSelectProcessor.cpp | 5 ----- ...5_read_in_order_max_rows_to_read.reference | 6 +++++ .../02155_read_in_order_max_rows_to_read.sql | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index cdedd37e14a..07ac6f5764b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -875,12 +875,22 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd { std::atomic total_rows{0}; + /// Do not check number of read rows if we have reading + /// in order of sorting key with limit. + /// In general case, when there exists WHERE clause + /// it's impossible to estimate number of rows precisely, + /// because we can stop reading at any time. + SizeLimits limits; - if (settings.read_overflow_mode == OverflowMode::THROW && settings.max_rows_to_read) + if (settings.read_overflow_mode == OverflowMode::THROW + && settings.max_rows_to_read + && !query_info.input_order_info) limits = SizeLimits(settings.max_rows_to_read, 0, settings.read_overflow_mode); SizeLimits leaf_limits; - if (settings.read_overflow_mode_leaf == OverflowMode::THROW && settings.max_rows_to_read_leaf) + if (settings.read_overflow_mode_leaf == OverflowMode::THROW + && settings.max_rows_to_read_leaf + && !query_info.input_order_info) leaf_limits = SizeLimits(settings.max_rows_to_read_leaf, 0, settings.read_overflow_mode_leaf); auto mark_cache = context->getIndexMarkCache(); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 2d4d3617cee..332eb27094a 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,11 +37,6 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { - /// Actually it means that parallel reading from replicas enabled - /// and we have to collaborate with initiator. 
- /// In this case we won't set approximate rows, because it will be accounted multiple times - if (!extension_.has_value()) - addTotalRowsApprox(total_rows); ordered_names = header_without_virtual_columns.getNames(); } diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference new file mode 100644 index 00000000000..b73ab43cabb --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference @@ -0,0 +1,6 @@ +10 +0 +1 +2 +3 +4 diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql new file mode 100644 index 00000000000..e82c78b5e42 --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_max_rows_to_read; + +CREATE TABLE t_max_rows_to_read (a UInt64) +ENGINE = MergeTree ORDER BY a +SETTINGS index_granularity = 4; + +INSERT INTO t_max_rows_to_read SELECT number FROM numbers(100); + +SET max_threads = 1; + +SELECT a FROM t_max_rows_to_read WHERE a = 10 SETTINGS max_rows_to_read = 4; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 12; + +-- This should work, but actually it doesn't. Need to investigate. +-- SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 20; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 20 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a = 10 OR a = 20 FORMAT Null SETTINGS max_rows_to_read = 4; -- { serverError 158 } + +DROP TABLE t_max_rows_to_read; From 04e6e6ab5e4818f8774ecf3878ca2205d8e1aae6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 21 Mar 2022 14:52:26 +0000 Subject: [PATCH 02/58] Add ParallelReadBuffer for S3 --- src/Common/RangeGenerator.h | 46 ++++++++++++++++++++ src/IO/ReadBufferFromS3.h | 73 ++++++++++++++++++++++++++++++++ src/IO/ReadWriteBufferFromHTTP.h | 38 +---------------- src/IO/S3Common.cpp | 23 ++++++++++ src/IO/S3Common.h | 2 + src/Storages/StorageS3.cpp | 8 +++- 6 files changed, 152 insertions(+), 38 deletions(-) create mode 100644 src/Common/RangeGenerator.h diff --git a/src/Common/RangeGenerator.h b/src/Common/RangeGenerator.h new file mode 100644 index 00000000000..8d3b88936fc --- /dev/null +++ b/src/Common/RangeGenerator.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class RangeGenerator +{ +public: + explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0) + : from(range_start), range_step(range_step_), total_size(total_size_) + { + } + + size_t totalRanges() const { return static_cast(round(static_cast(total_size - from) / range_step)); } + + using Range = std::pair; + + // return upper exclusive range of values, i.e. 
[from_range, to_range> + std::optional nextRange() + { + if (from >= total_size) + { + return std::nullopt; + } + + auto to = from + range_step; + if (to >= total_size) + { + to = total_size; + } + + Range range{from, to}; + from = to; + return std::move(range); + } + +private: + size_t from; + size_t range_step; + size_t total_size; +}; + +} diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 157b6d46b6d..6668f570cb6 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,5 +1,6 @@ #pragma once +#include #include #if USE_AWS_S3 @@ -10,6 +11,7 @@ #include #include #include +#include #include @@ -62,6 +64,12 @@ public: size_t getFileOffsetOfBufferEnd() const override { return offset; } + void setRange(off_t begin, off_t end) + { + offset = begin; + read_until_position = end; + } + private: std::unique_ptr initialize(); @@ -76,6 +84,71 @@ private: bool restricted_seek; }; +/// Creates separate ReadBufferFromS3 for sequence of ranges of particular object +class ReadBufferS3Factory : public ParallelReadBuffer::ReadBufferFactory +{ +public: + explicit ReadBufferS3Factory( + std::shared_ptr client_ptr_, + const String & bucket_, + const String & key_, + size_t range_step_, + size_t object_size_, + UInt64 s3_max_single_read_retries_, + const ReadSettings &read_settings) + : client_ptr(client_ptr_) + , bucket(bucket_) + , key(key_) + , settings(read_settings) + , range_generator(object_size_, range_step_) + , range_step(range_step_) + , object_size(object_size_) + , s3_max_single_read_retries(s3_max_single_read_retries_) + { + assert(range_step > 0); + assert(range_step < object_size); + } + + size_t totalRanges() const + { + return static_cast(round(static_cast(object_size) / range_step)); + } + + SeekableReadBufferPtr getReader() override + { + const auto next_range = range_generator.nextRange(); + if (!next_range) + { + return nullptr; + } + + auto reader = std::make_shared( + client_ptr, bucket, key, s3_max_single_read_retries, settings); + reader->setRange(next_range->first, next_range->second - 1); + return reader; + } + + off_t seek(off_t off, [[maybe_unused]] int whence) override + { + range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; + return off; + } + + std::optional getTotalSize() override { return object_size; } + +private: + std::shared_ptr client_ptr; + const String bucket; + const String key; + ReadSettings settings; + + RangeGenerator range_generator; + size_t range_step; + size_t object_size; + + UInt64 s3_max_single_read_retries; +}; + } #endif diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h index 061dd772212..f0c10759142 100644 --- a/src/IO/ReadWriteBufferFromHTTP.h +++ b/src/IO/ReadWriteBufferFromHTTP.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -635,43 +636,6 @@ public: void buildNewSession(const Poco::URI & uri) override { session = makeHTTPSession(uri, timeouts); } }; -class RangeGenerator -{ -public: - explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0) - : from(range_start), range_step(range_step_), total_size(total_size_) - { - } - - size_t totalRanges() const { return static_cast(round(static_cast(total_size - from) / range_step)); } - - using Range = std::pair; - - // return upper exclusive range of values, i.e. 
[from_range, to_range> - std::optional nextRange() - { - if (from >= total_size) - { - return std::nullopt; - } - - auto to = from + range_step; - if (to >= total_size) - { - to = total_size; - } - - Range range{from, to}; - from = to; - return std::move(range); - } - -private: - size_t from; - size_t range_step; - size_t total_size; -}; - class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase> { using Parent = detail::ReadWriteBufferFromHTTPBase>; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index e63e6fde1f4..971ac0d8a73 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -33,6 +33,8 @@ # include # include +#include // Y_IGNORE +#include // Y_IGNORE namespace { @@ -682,6 +684,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int S3_ERROR; } namespace S3 @@ -839,6 +842,26 @@ namespace S3 throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}", quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : ""); } + + size_t getObjectSize(std::shared_ptr client_ptr, const String & bucket, const String & key, bool throw_on_error) + { + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(bucket); + req.SetKey(key); + + Aws::S3::Model::HeadObjectOutcome outcome = client_ptr->HeadObject(req); + + if (outcome.IsSuccess()) + { + auto read_result = outcome.GetResultWithOwnership(); + return static_cast(read_result.GetContentLength()); + } + else if (throw_on_error) + { + throw DB::Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + } + return 0; + } } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 97cb4f74f90..c33ce427e66 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -75,6 +75,8 @@ struct URI static void validateBucket(const String & bucket, const Poco::URI & uri); }; +size_t getObjectSize(std::shared_ptr client_ptr, const String & bucket, const String & key, bool throw_on_error = true); + } #endif diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f319bd1097b..6f5228c2c63 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1,4 +1,6 @@ #include +#include "IO/ParallelReadBuffer.h" +#include "IO/IOThreadPool.h" #include "Parsers/ASTCreateQuery.h" #if USE_AWS_S3 @@ -273,9 +275,13 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; + size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); + auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); + read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + std::make_unique(std::move(factory), &IOThreadPool::get(), 4), chooseCompressionMethod(current_key, compression_hint)); + auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); From 6785ad165a72e372f155a10e71a872e37bdfc2c1 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 22 Mar 2022 14:06:30 +0000 Subject: [PATCH 03/58] Fix issue for mutliple download threads --- src/IO/ParallelReadBuffer.cpp | 2 +- src/IO/ParallelReadBuffer.h | 4 ++-- src/IO/ReadBufferFromS3.h | 4 ++-- src/Storages/StorageS3.cpp | 20 +++++++++++++++----- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git 
a/src/IO/ParallelReadBuffer.cpp b/src/IO/ParallelReadBuffer.cpp index 7fa10c160ad..3ee62148c23 100644 --- a/src/IO/ParallelReadBuffer.cpp +++ b/src/IO/ParallelReadBuffer.cpp @@ -237,7 +237,7 @@ void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker) while (!emergency_stop && !read_worker->cancel) { if (!read_worker->reader->next()) - throw Exception("Failed to read all the data from the reader", ErrorCodes::LOGICAL_ERROR); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to read all the data from the reader, missing {} bytes", read_worker->bytes_left); if (emergency_stop || read_worker->cancel) break; diff --git a/src/IO/ParallelReadBuffer.h b/src/IO/ParallelReadBuffer.h index 7b364205e8e..7a3f0cb3773 100644 --- a/src/IO/ParallelReadBuffer.h +++ b/src/IO/ParallelReadBuffer.h @@ -82,8 +82,8 @@ public: std::unique_ptr reader_factory_, ThreadPool * pool, size_t max_working_readers, - WorkerSetup worker_setup = {}, - WorkerCleanup worker_cleanup = {}); + WorkerSetup worker_setup = [](ThreadStatus &){}, + WorkerCleanup worker_cleanup = [](ThreadStatus &){}); ~ParallelReadBuffer() override { finishAndWait(); } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 6668f570cb6..e3708c4b748 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -60,7 +60,7 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position }; } + Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position - 1}; } size_t getFileOffsetOfBufferEnd() const override { return offset; } @@ -124,7 +124,7 @@ public: auto reader = std::make_shared( client_ptr, bucket, key, s3_max_single_read_retries, settings); - reader->setRange(next_range->first, next_range->second - 1); + reader->setRange(next_range->first, next_range->second); return reader; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6f5228c2c63..9dc15e54ecc 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -275,12 +275,22 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); - auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); + const auto max_download_threads = getContext()->getSettings().max_download_threads; + if (max_download_threads == 1) + { + read_buf = wrapReadBufferWithCompressionMethod( + std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), + chooseCompressionMethod(current_key, compression_hint)); + } + else + { + size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); + auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, object_size, max_single_read_retries, getContext()->getReadSettings()); - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(std::move(factory), &IOThreadPool::get(), 4), - chooseCompressionMethod(current_key, compression_hint)); + read_buf = wrapReadBufferWithCompressionMethod( + std::make_unique(std::move(factory), &IOThreadPool::get(), getContext()->getSettings().max_download_threads), + chooseCompressionMethod(current_key, compression_hint)); + } auto input_format = getContext()->getInputFormat(format, *read_buf, 
sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; From f693eba568415e66d856da7c45df824d97949361 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 22 Mar 2022 14:30:40 +0000 Subject: [PATCH 04/58] fix tests with approx rows --- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 9 ++++++--- src/Storages/MergeTree/MergeTreeIOSettings.h | 2 ++ src/Storages/MergeTree/MergeTreeSelectProcessor.cpp | 8 ++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1bfc1ec7306..cf7b2dcc855 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -45,7 +45,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & context) +static MergeTreeReaderSettings getMergeTreeReaderSettings( + const ContextPtr & context, const SelectQueryInfo & query_info) { const auto & settings = context->getSettingsRef(); return @@ -53,6 +54,7 @@ static MergeTreeReaderSettings getMergeTreeReaderSettings(const ContextPtr & con .read_settings = context->getReadSettings(), .save_marks_in_cache = true, .checksum_on_read = settings.checksum_on_read, + .read_in_order = query_info.input_order_info != nullptr, }; } @@ -82,7 +84,7 @@ ReadFromMergeTree::ReadFromMergeTree( getPrewhereInfo(query_info_), data_.getPartitionValueType(), virt_column_names_)}) - , reader_settings(getMergeTreeReaderSettings(context_)) + , reader_settings(getMergeTreeReaderSettings(context_, query_info_)) , prepared_parts(std::move(parts_)) , real_column_names(std::move(real_column_names_)) , virt_column_names(std::move(virt_column_names_)) @@ -203,6 +205,7 @@ ProcessorPtr ReadFromMergeTree::createSource( .colums_to_read = required_columns }; } + return std::make_shared( data, storage_snapshot, part.data_part, max_block_size, preferred_block_size_bytes, preferred_max_column_in_block_size_bytes, required_columns, part.ranges, use_uncompressed_cache, prewhere_info, @@ -918,7 +921,7 @@ MergeTreeDataSelectAnalysisResultPtr ReadFromMergeTree::selectRangesToRead( total_marks_pk += part->index_granularity.getMarksCountWithoutFinal(); parts_before_pk = parts.size(); - auto reader_settings = getMergeTreeReaderSettings(context); + auto reader_settings = getMergeTreeReaderSettings(context, query_info); bool use_skip_indexes = settings.use_skip_indexes; if (select.final() && !settings.use_skip_indexes_if_final) diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h index aaa8fae7dba..4fe424c98ff 100644 --- a/src/Storages/MergeTree/MergeTreeIOSettings.h +++ b/src/Storages/MergeTree/MergeTreeIOSettings.h @@ -20,6 +20,8 @@ struct MergeTreeReaderSettings bool save_marks_in_cache = false; /// Validate checksums on reading (should be always enabled in production). bool checksum_on_read = true; + /// True if we read in order of sorting key. 
+ bool read_in_order = false; }; struct MergeTreeWriterSettings diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 90e272e329e..f6cbf54b752 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,6 +37,14 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { + /// Actually it means that parallel reading from replicas enabled + /// and we have to collaborate with initiator. + /// In this case we won't set approximate rows, because it will be accounted multiple times. + /// Also do not count amount of read rows if we read in order of sorting key, + /// because we don't know actual amount of read rows in case when limit is set. + if (!extension_.has_value() && !reader_settings.read_in_order) + addTotalRowsApprox(total_rows); + ordered_names = header_without_virtual_columns.getNames(); } From f32ef2a556f4b6a667ec2a0ff3d3553703046dc0 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 08:15:18 +0000 Subject: [PATCH 05/58] Small polishing for S3 reader --- src/IO/ReadBufferFromS3.cpp | 4 +++- src/IO/ReadBufferFromS3.h | 48 +++++++++++++++++-------------------- 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 93bbe02c9cd..05e1276bc63 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -42,6 +42,7 @@ ReadBufferFromS3::ReadBufferFromS3( UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, + size_t offset_, size_t read_until_position_, bool restricted_seek_) : SeekableReadBufferWithSize(nullptr, 0) @@ -49,9 +50,10 @@ ReadBufferFromS3::ReadBufferFromS3( , bucket(bucket_) , key(key_) , max_single_read_retries(max_single_read_retries_) + , offset(offset_) + , read_until_position(read_until_position_) , read_settings(settings_) , use_external_buffer(use_external_buffer_) - , read_until_position(read_until_position_) , restricted_seek(restricted_seek_) { } diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index e3708c4b748..8d71a03fb87 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -5,15 +5,15 @@ #if USE_AWS_S3 -#include +# include -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include -#include +# include namespace Aws::S3 { @@ -32,7 +32,9 @@ private: String bucket; String key; UInt64 max_single_read_retries; + off_t offset = 0; + off_t read_until_position = 0; Aws::S3::Model::GetObjectResult read_result; std::unique_ptr impl; @@ -47,6 +49,7 @@ public: UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer = false, + size_t offset_ = 0, size_t read_until_position_ = 0, bool restricted_seek_ = false); @@ -60,16 +63,10 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{ .left = static_cast(offset), .right = read_until_position - 1}; } + Range getRemainingReadRange() const override { return Range{.left = static_cast(offset), .right = read_until_position - 1}; } size_t getFileOffsetOfBufferEnd() const override { return offset; } - void setRange(off_t begin, off_t end) - { - offset = begin; - read_until_position = end; - } - private: std::unique_ptr initialize(); @@ 
-77,8 +74,6 @@ private: bool use_external_buffer; - off_t read_until_position = 0; - /// There is different seek policy for disk seek and for non-disk seek /// (non-disk seek is applied for seekable input formats: orc, arrow, parquet). bool restricted_seek; @@ -95,11 +90,11 @@ public: size_t range_step_, size_t object_size_, UInt64 s3_max_single_read_retries_, - const ReadSettings &read_settings) + const ReadSettings & read_settings_) : client_ptr(client_ptr_) , bucket(bucket_) , key(key_) - , settings(read_settings) + , read_settings(read_settings_) , range_generator(object_size_, range_step_) , range_step(range_step_) , object_size(object_size_) @@ -109,11 +104,6 @@ public: assert(range_step < object_size); } - size_t totalRanges() const - { - return static_cast(round(static_cast(object_size) / range_step)); - } - SeekableReadBufferPtr getReader() override { const auto next_range = range_generator.nextRange(); @@ -123,8 +113,14 @@ public: } auto reader = std::make_shared( - client_ptr, bucket, key, s3_max_single_read_retries, settings); - reader->setRange(next_range->first, next_range->second); + client_ptr, + bucket, + key, + s3_max_single_read_retries, + read_settings, + false /*use_external_buffer*/, + next_range->first, + next_range->second + 1); return reader; } @@ -140,7 +136,7 @@ private: std::shared_ptr client_ptr; const String bucket; const String key; - ReadSettings settings; + ReadSettings read_settings; RangeGenerator range_generator; size_t range_step; From 131b3a091c60a13ed7cd7ddfe090e5dda204b6ad Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 08:40:00 +0000 Subject: [PATCH 06/58] Refactor StorageS3 --- src/IO/ReadBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 70 ++++++++++++++++++++++++++------------ src/Storages/StorageS3.h | 6 +++- 3 files changed, 54 insertions(+), 24 deletions(-) diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 8d71a03fb87..d984fbebe5f 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -120,7 +120,7 @@ public: read_settings, false /*use_external_buffer*/, next_range->first, - next_range->second + 1); + next_range->second); return reader; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 9dc15e54ecc..6966d356070 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -238,7 +238,8 @@ StorageS3Source::StorageS3Source( String compression_hint_, const std::shared_ptr & client_, const String & bucket_, - std::shared_ptr file_iterator_) + std::shared_ptr file_iterator_, + const size_t download_thread_num_) : SourceWithProgress(getHeader(sample_block_, need_path, need_file)) , WithContext(context_) , name(std::move(name_)) @@ -254,6 +255,7 @@ StorageS3Source::StorageS3Source( , with_file_column(need_file) , with_path_column(need_path) , file_iterator(file_iterator_) + , download_thread_num(download_thread_num_) { initialize(); } @@ -275,22 +277,7 @@ bool StorageS3Source::initialize() file_path = fs::path(bucket) / current_key; - const auto max_download_threads = getContext()->getSettings().max_download_threads; - if (max_download_threads == 1) - { - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()), - chooseCompressionMethod(current_key, compression_hint)); - } - else - { - size_t object_size = DB::S3::getObjectSize(client, bucket, current_key, false); - auto factory = std::make_unique(client, bucket, current_key, 10 * 1024 * 1024, 
object_size, max_single_read_retries, getContext()->getReadSettings()); - - read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(std::move(factory), &IOThreadPool::get(), getContext()->getSettings().max_download_threads), - chooseCompressionMethod(current_key, compression_hint)); - } + read_buf = wrapReadBufferWithCompressionMethod(createS3ReadBuffer(current_key), chooseCompressionMethod(current_key, compression_hint)); auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings); QueryPipelineBuilder builder; @@ -298,10 +285,9 @@ bool StorageS3Source::initialize() if (columns_desc.hasDefaults()) { - builder.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, columns_desc, *input_format, getContext()); - }); + builder.addSimpleTransform( + [&](const Block & header) + { return std::make_shared(header, columns_desc, *input_format, getContext()); }); } pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); @@ -311,6 +297,43 @@ bool StorageS3Source::initialize() return true; } +std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & key) +{ + const size_t object_size = DB::S3::getObjectSize(client, bucket, key, false); + + auto download_buffer_size = getContext()->getSettings().max_download_buffer_size; + const bool use_parallel_download = download_buffer_size > 0 && download_thread_num > 1; + const bool object_too_small = object_size < download_thread_num * download_buffer_size; + if (!use_parallel_download || object_too_small) + { + LOG_TRACE(&Poco::Logger::get("StorageS3Source"), "Downloading object of size {} from S3 in single thread", object_size); + return std::make_unique(client, bucket, key, max_single_read_retries, getContext()->getReadSettings()); + } + + assert(object_size > 0); + + if (download_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) + { + LOG_WARNING( + &Poco::Logger::get("StorageS3Source"), + "Downloading buffer {} bytes too small, set at least {} bytes", + download_buffer_size, + DBMS_DEFAULT_BUFFER_SIZE); + download_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; + } + + auto factory = std::make_unique( + client, bucket, key, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); + LOG_TRACE( + &Poco::Logger::get("StorageS3Source"), + "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", + download_thread_num, + object_size, + download_buffer_size); + + return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num); +} + String StorageS3Source::getName() const { return name; @@ -674,6 +697,8 @@ Pipe StorageS3::read( block_for_format = storage_snapshot->metadata->getSampleBlock(); } + const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; + const size_t download_threads_per_stream = num_streams >= max_download_threads ? 
1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( @@ -690,7 +715,8 @@ Pipe StorageS3::read( compression_method, client_auth.client, client_auth.uri.bucket, - iterator_wrapper)); + iterator_wrapper, + download_threads_per_stream)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 300b7becb93..6c88f5bba56 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -74,7 +74,8 @@ public: String compression_hint_, const std::shared_ptr & client_, const String & bucket, - std::shared_ptr file_iterator_); + std::shared_ptr file_iterator_, + size_t download_thread_num); String getName() const override; @@ -105,9 +106,12 @@ private: bool with_file_column = false; bool with_path_column = false; std::shared_ptr file_iterator; + size_t download_thread_num = 1; /// Recreate ReadBuffer and BlockInputStream for each file. bool initialize(); + + std::unique_ptr createS3ReadBuffer(const String & key); }; /** From 93ad209910cb00c492bd727de0e5eed3baf2e55d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 11:52:31 +0000 Subject: [PATCH 07/58] Attach memory tracker --- src/Storages/StorageS3.cpp | 32 ++++++++++++++++++++++++++++---- src/Storages/StorageS3.h | 1 - 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 6966d356070..35a8405eeb2 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -293,7 +293,6 @@ bool StorageS3Source::initialize() pipeline = std::make_unique(QueryPipelineBuilder::getPipeline(std::move(builder))); reader = std::make_unique(*pipeline); - initialized = false; return true; } @@ -331,7 +330,32 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k object_size, download_buffer_size); - return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num); + ThreadGroupStatusPtr running_group = CurrentThread::isInitialized() && CurrentThread::get().getThreadGroup() + ? CurrentThread::get().getThreadGroup() + : MainThreadStatus::getInstance().getThreadGroup(); + + ContextPtr query_context + = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; + + auto worker_cleanup = [has_running_group = running_group == nullptr](ThreadStatus & thread_status) + { + if (has_running_group) + thread_status.detachQuery(false); + }; + + auto worker_setup = [query_context = std::move(query_context), + running_group = std::move(running_group)](ThreadStatus & thread_status) + { + /// Save query context if any, because cache implementation needs it. + if (query_context) + thread_status.attachQueryContext(query_context); + + /// To be able to pass ProfileEvents. + if (running_group) + thread_status.attachQuery(running_group); + }; + + return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num, worker_setup, worker_cleanup); } String StorageS3Source::getName() const @@ -698,7 +722,7 @@ Pipe StorageS3::read( } const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; - const size_t download_threads_per_stream = num_streams >= max_download_threads ? 1 : (max_download_threads / num_streams); + // const size_t download_threads_per_stream = num_streams >= max_download_threads ? 
1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( @@ -716,7 +740,7 @@ Pipe StorageS3::read( client_auth.client, client_auth.uri.bucket, iterator_wrapper, - download_threads_per_stream)); + max_download_threads)); } auto pipe = Pipe::unitePipes(std::move(pipes)); diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 6c88f5bba56..6362e777df6 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -102,7 +102,6 @@ private: std::unique_ptr reader; /// onCancel and generate can be called concurrently std::mutex reader_mutex; - bool initialized = false; bool with_file_column = false; bool with_path_column = false; std::shared_ptr file_iterator; From 1a5f5c32bb2e2a96d22c0ef8132d29fc21db5aba Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 23 Mar 2022 14:36:25 +0000 Subject: [PATCH 08/58] Remove commented out code --- src/Storages/StorageS3.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 35a8405eeb2..b80b9ead53f 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -722,7 +722,6 @@ Pipe StorageS3::read( } const size_t max_download_threads = local_context->getSettingsRef().max_download_threads; - // const size_t download_threads_per_stream = num_streams >= max_download_threads ? 1 : (max_download_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) { pipes.emplace_back(std::make_shared( From e0d7b6dc3ed239ef4075add09090751d45941a4e Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 09:30:06 +0000 Subject: [PATCH 09/58] Add tests for S3 multithreaded download --- tests/integration/test_storage_s3/test.py | 34 ++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index dd29d0a5d6a..7364af4d2dd 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -535,8 +535,9 @@ def test_multipart_put(started_cluster, maybe_auth, positive): one_line_length = 6 # 3 digits, 2 commas, 1 line separator. 
+ total_rows = csv_size_bytes // one_line_length # Generate data having size more than one part - int_data = [[1, 2, 3] for i in range(csv_size_bytes // one_line_length)] + int_data = [[1, 2, 3] for i in range(total_rows)] csv_data = "".join(["{},{},{}\n".format(x, y, z) for x, y, z in int_data]) assert len(csv_data) > min_part_size_bytes @@ -573,6 +574,37 @@ def test_multipart_put(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) + # select uploaded data from many threads + select_query = ( + "select sum(column1), sum(column2), sum(column3) " + "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, + ) + ) + try: + select_result = run_query( + instance, + select_query, + settings={ + "max_download_threads": random.randint(4, 16), + "max_download_buffer_size": 1024 * 1024, + }, + ) + except helpers.client.QueryRuntimeException: + if positive: + raise + else: + assert positive + assert ( + select_result + == "\t".join(map(str, [total_rows, total_rows * 2, total_rows * 3])) + "\n" + ) + def test_remote_host_filter(started_cluster): instance = started_cluster.instances["restricted_dummy"] From 62a495a5fc64c24b83982a362c8e72b965f2fb09 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 24 Mar 2022 09:38:11 +0000 Subject: [PATCH 10/58] Small refactor --- src/IO/ReadBufferFromS3.cpp | 30 ++++++++++++++++ src/IO/ReadBufferFromS3.h | 42 ++++++----------------- src/IO/S3Common.cpp | 3 +- src/Storages/StorageS3.cpp | 3 +- tests/integration/test_storage_s3/test.py | 2 +- 5 files changed, 44 insertions(+), 36 deletions(-) diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 05e1276bc63..c78f41a0f9d 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -274,6 +274,36 @@ std::unique_ptr ReadBufferFromS3::initialize() throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); } +SeekableReadBufferPtr ReadBufferS3Factory::getReader() +{ + const auto next_range = range_generator.nextRange(); + if (!next_range) + { + return nullptr; + } + + auto reader = std::make_shared( + client_ptr, + bucket, + key, + s3_max_single_read_retries, + read_settings, + false /*use_external_buffer*/, + next_range->first, + next_range->second); + return reader; +} + +off_t ReadBufferS3Factory::seek(off_t off, [[maybe_unused]] int whence) +{ + range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; + return off; +} + +std::optional ReadBufferS3Factory::getTotalSize() +{ + return object_size; +} } #endif diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index d984fbebe5f..dbba751b321 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -5,15 +5,15 @@ #if USE_AWS_S3 -# include +#include -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include -# include +#include namespace Aws::S3 { @@ -104,33 +104,11 @@ public: assert(range_step < object_size); } - SeekableReadBufferPtr getReader() override - { - const auto next_range = range_generator.nextRange(); - if (!next_range) - { - return nullptr; - } + SeekableReadBufferPtr getReader() override; - auto reader = std::make_shared( - client_ptr, - bucket, - key, - s3_max_single_read_retries, - read_settings, - false /*use_external_buffer*/, - 
next_range->first, - next_range->second); - return reader; - } + off_t seek(off_t off, [[maybe_unused]] int whence) override; - off_t seek(off_t off, [[maybe_unused]] int whence) override - { - range_generator = RangeGenerator{object_size, range_step, static_cast(off)}; - return off; - } - - std::optional getTotalSize() override { return object_size; } + std::optional getTotalSize() override; private: std::shared_ptr client_ptr; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 971ac0d8a73..e706f0a75c1 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -24,6 +24,7 @@ # include # include # include +# include // Y_IGNORE # include # include @@ -33,8 +34,6 @@ # include # include -#include // Y_IGNORE -#include // Y_IGNORE namespace { diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b80b9ead53f..5625ff16a10 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -355,7 +355,8 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k thread_status.attachQuery(running_group); }; - return std::make_unique(std::move(factory), &IOThreadPool::get(), download_thread_num, worker_setup, worker_cleanup); + return std::make_unique( + std::move(factory), &IOThreadPool::get(), download_thread_num, std::move(worker_setup), std::move(worker_cleanup)); } String StorageS3Source::getName() const diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 7364af4d2dd..7fc59d9d9ef 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -517,7 +517,7 @@ def test_put_get_with_globs(started_cluster): # ("'minio','minio123',",True), Redirect with credentials not working with nginx. ], ) -def test_multipart_put(started_cluster, maybe_auth, positive): +def test_multipart(started_cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None bucket = ( From 67195bfdd5f883e2e23e591bc78d2f7a56200c04 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 25 Mar 2022 21:00:00 +0000 Subject: [PATCH 11/58] support schema inference for type Object in format JSONEachRow --- src/Formats/JSONEachRowUtils.cpp | 20 +- src/Formats/ReadSchemaUtils.cpp | 40 ++++ src/Functions/FunctionsConversion.h | 173 +++++++++++++----- src/Interpreters/InterpreterInsertQuery.cpp | 1 + src/Interpreters/executeQuery.cpp | 16 +- src/Processors/Formats/ISchemaReader.h | 5 + .../01825_type_json_from_map.reference | 4 + .../0_stateless/01825_type_json_from_map.sql | 41 +++++ ...01825_type_json_schema_inference.reference | 8 + .../01825_type_json_schema_inference.sh | 52 ++++++ 10 files changed, 303 insertions(+), 57 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_from_map.reference create mode 100644 tests/queries/0_stateless/01825_type_json_from_map.sql create mode 100644 tests/queries/0_stateless/01825_type_json_schema_inference.reference create mode 100755 tests/queries/0_stateless/01825_type_json_schema_inference.sh diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 66e0538fef1..80b9f84dafe 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -158,22 +159,29 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) { auto object = field.getObject(); DataTypePtr value_type; + bool is_map = true; for (const auto key_value_pair : object) { auto type = 
getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type) - return nullptr; + if (!type || isObject(type)) + { + is_map = false; + break; + } if (value_type && value_type->getName() != type->getName()) - return nullptr; + { + is_map = false; + break; + } value_type = type; } - if (!value_type) - return nullptr; + if (is_map && value_type) + return std::make_shared(std::make_shared(), value_type); - return std::make_shared(std::make_shared(), value_type); + return std::make_shared("json", false); } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 559fac4cfaa..8e2531e2006 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include namespace DB { @@ -17,6 +19,28 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +static std::optional getOrderedColumnsList( + const NamesAndTypesList & columns_list, const Names & columns_order_hint) +{ + if (columns_list.size() != columns_order_hint.size()) + return {}; + + std::unordered_map available_columns; + for (const auto & [name, type] : columns_list) + available_columns.emplace(name, type); + + NamesAndTypesList res; + for (const auto & name : columns_order_hint) + { + auto it = available_columns.find(name); + if (it == available_columns.end()) + return {}; + + res.emplace_back(name, it->second); + } + return res; +} + ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, @@ -52,6 +76,22 @@ ColumnsDescription readSchemaFromFormat( { throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot extract table structure from {} format file. Error: {}", format_name, e.message()); } + + /// If we have "INSERT SELECT" query then try to order + /// columns as they are ordered in table schema for formats + /// without strict column order (like JSON and TSKV). + /// It will allow to execute simple data loading with query + /// "INSERT INTO table SELECT * FROM ..." + const auto & insertion_table = context->getInsertionTable(); + if (!schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + { + auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); + auto metadata = storage->getInMemoryMetadataPtr(); + auto names_in_storage = metadata->getColumns().getNamesOfPhysical(); + auto ordered_list = getOrderedColumnsList(names_and_types, names_in_storage); + if (ordered_list) + names_and_types = *ordered_list; + } } else throw Exception(ErrorCodes::BAD_ARGUMENTS, "{} file format doesn't support schema inference", format_name); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index e098378f51a..edddb200648 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -53,6 +53,7 @@ #include #include #include +#include namespace DB @@ -3140,52 +3141,138 @@ private: } } + WrapperType createTupleToObjectWrapper(const DataTypeTuple & from_tuple, bool has_nullable_subcolumns) const + { + if (!from_tuple.haveExplicitNames()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. 
Got: {}", from_tuple.getName()); + + PathsInData paths; + DataTypes from_types; + + std::tie(paths, from_types) = flattenTuple(from_tuple.getPtr()); + auto to_types = from_types; + + for (auto & type : to_types) + { + if (isTuple(type) || isNested(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. Got: {}", + from_tuple.getName()); + + type = recursiveRemoveLowCardinality(type); + } + + return [element_wrappers = getElementWrappers(from_types, to_types), + has_nullable_subcolumns, from_types, to_types, paths] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) + { + size_t tuple_size = to_types.size(); + auto flattened_column = flattenTuple(arguments.front().column); + const auto & column_tuple = assert_cast(*flattened_column); + + if (tuple_size != column_tuple.getColumns().size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Expected tuple with {} subcolumn, but got {} subcolumns", + tuple_size, column_tuple.getColumns().size()); + + auto res = ColumnObject::create(has_nullable_subcolumns); + for (size_t i = 0; i < tuple_size; ++i) + { + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; + auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); + res->addSubcolumn(paths[i], converted_column->assumeMutable()); + } + + return res; + }; + } + + WrapperType createMapToObjectWrapper(const DataTypeMap & from_map, bool has_nullable_subcolumns) const + { + auto key_value_types = from_map.getKeyValueTypes(); + + if (!isStringOrFixedString(key_value_types[0])) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object from Map can be performed only from Map " + "with String or FixedString key. 
Got: {}", from_map.getName()); + + const auto & value_type = key_value_types[1]; + auto to_value_type = value_type; + + if (!has_nullable_subcolumns && value_type->isNullable()) + to_value_type = removeNullable(value_type); + + if (has_nullable_subcolumns && !value_type->isNullable()) + to_value_type = makeNullable(value_type); + + DataTypes to_key_value_types{std::make_shared(), std::move(to_value_type)}; + auto element_wrappers = getElementWrappers(key_value_types, to_key_value_types); + + return [has_nullable_subcolumns, element_wrappers, key_value_types, to_key_value_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t) -> ColumnPtr + { + const auto & column_map = assert_cast(*arguments.front().column); + const auto & offsets = column_map.getNestedColumn().getOffsets(); + auto key_value_columns = column_map.getNestedData().getColumnsCopy(); + + for (size_t i = 0; i < 2; ++i) + { + ColumnsWithTypeAndName element{{key_value_columns[i], key_value_types[i], ""}}; + key_value_columns[i] = element_wrappers[i](element, to_key_value_types[i], nullable_source, key_value_columns[i]->size()); + } + + const auto & key_column_str = assert_cast(*key_value_columns[0]); + const auto & value_column = *key_value_columns[1]; + + using SubcolumnsMap = HashMap; + SubcolumnsMap subcolumns; + + for (size_t row = 0; row < offsets.size(); ++row) + { + for (size_t i = offsets[static_cast(row) - 1]; i < offsets[row]; ++i) + { + auto ref = key_column_str.getDataAt(i); + + bool inserted; + SubcolumnsMap::LookupResult it; + subcolumns.emplace(ref, it, inserted); + auto & subcolumn = it->getMapped(); + + if (inserted) + subcolumn = value_column.cloneEmpty()->cloneResized(row); + + /// Map can have duplicated keys. We insert only first one. + if (subcolumn->size() == row) + subcolumn->insertFrom(value_column, i); + } + + /// Insert default values for keys missed in current row. + for (const auto & [_, subcolumn] : subcolumns) + if (subcolumn->size() == row) + subcolumn->insertDefault(); + } + + auto column_object = ColumnObject::create(has_nullable_subcolumns); + for (auto && [key, subcolumn] : subcolumns) + { + PathInData path(key.toView()); + column_object->addSubcolumn(path, std::move(subcolumn)); + } + + return column_object; + }; + } + WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const { if (const auto * from_tuple = checkAndGetDataType(from_type.get())) { - if (!from_tuple->haveExplicitNames()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. Got: {}", from_type->getName()); - - PathsInData paths; - DataTypes from_types; - - std::tie(paths, from_types) = flattenTuple(from_type); - auto to_types = from_types; - - for (auto & type : to_types) - { - if (isTuple(type) || isNested(type)) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. 
Got: {}", from_type->getName()); - - type = recursiveRemoveLowCardinality(type); - } - - return [element_wrappers = getElementWrappers(from_types, to_types), - has_nullable_subcolumns = to_type->hasNullableSubcolumns(), from_types, to_types, paths] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) - { - size_t tuple_size = to_types.size(); - auto flattened_column = flattenTuple(arguments.front().column); - const auto & column_tuple = assert_cast(*flattened_column); - - if (tuple_size != column_tuple.getColumns().size()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Expected tuple with {} subcolumn, but got {} subcolumns", - tuple_size, column_tuple.getColumns().size()); - - auto res = ColumnObject::create(has_nullable_subcolumns); - for (size_t i = 0; i < tuple_size; ++i) - { - ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; - auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); - res->addSubcolumn(paths[i], converted_column->assumeMutable()); - } - - return res; - }; + return createTupleToObjectWrapper(*from_tuple, to_type->hasNullableSubcolumns()); + } + else if (const auto * from_map = checkAndGetDataType(from_type.get())) + { + return createMapToObjectWrapper(*from_map, to_type->hasNullableSubcolumns()); } else if (checkAndGetDataType(from_type.get())) { @@ -3199,7 +3286,7 @@ private: } throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten named tuple or string. Got: {}", from_type->getName()); + "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); } template diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index df44814a96e..ce369182766 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -358,6 +358,7 @@ BlockIO InterpreterInsertQuery::execute() auto new_context = Context::createCopy(context); new_context->setSettings(new_settings); + new_context->setInsertionTable(getContext()->getInsertionTable()); InterpreterSelectWithUnionQuery interpreter_select{ query.select, new_context, SelectQueryOptions(QueryProcessingStage::Complete, 1)}; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index c1606700540..89f5f4e6959 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -656,6 +656,14 @@ static std::tuple executeQueryImpl( limits.size_limits = SizeLimits(settings.max_result_rows, settings.max_result_bytes, settings.result_overflow_mode); } + if (const auto * insert_interpreter = typeid_cast(&*interpreter)) + { + /// Save insertion table (not table function). TODO: support remote() table function. + auto table_id = insert_interpreter->getDatabaseTable(); + if (!table_id.empty()) + context->setInsertionTable(std::move(table_id)); + } + { std::unique_ptr span; if (context->query_trace_context.trace_id != UUID()) @@ -666,14 +674,6 @@ static std::tuple executeQueryImpl( } res = interpreter->execute(); } - - if (const auto * insert_interpreter = typeid_cast(&*interpreter)) - { - /// Save insertion table (not table function). TODO: support remote() table function. 
- auto table_id = insert_interpreter->getDatabaseTable(); - if (!table_id.empty()) - context->setInsertionTable(std::move(table_id)); - } } if (process_list_entry) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 2d35809e26a..36cf0656119 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -18,6 +18,10 @@ public: virtual NamesAndTypesList readSchema() = 0; + /// True if order of columns is important in format. + /// Exceptions: JSON, TSKV. + virtual bool hasStrictOrderOfColumns() const { return true; } + virtual ~ISchemaReader() = default; protected: @@ -60,6 +64,7 @@ class IRowWithNamesSchemaReader : public ISchemaReader public: IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); NamesAndTypesList readSchema() override; + bool hasStrictOrderOfColumns() const override { return false; } protected: /// Read one row and determine types of columns in it. diff --git a/tests/queries/0_stateless/01825_type_json_from_map.reference b/tests/queries/0_stateless/01825_type_json_from_map.reference new file mode 100644 index 00000000000..dbcf67faef3 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_from_map.reference @@ -0,0 +1,4 @@ +800000 2000000 1400000 900000 +800000 2000000 1400000 900000 +Tuple(col0 UInt64, col1 UInt64, col2 UInt64, col3 UInt64, col4 UInt64, col5 UInt64, col6 UInt64, col7 UInt64, col8 UInt64) +1600000 4000000 2800000 1800000 diff --git a/tests/queries/0_stateless/01825_type_json_from_map.sql b/tests/queries/0_stateless/01825_type_json_from_map.sql new file mode 100644 index 00000000000..2480aca1667 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_from_map.sql @@ -0,0 +1,41 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; +DROP TABLE IF EXISTS t_map; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) ENGINE = MergeTree ORDER BY id; +CREATE TABLE t_map(id UInt64, m Map(String, UInt64)) ENGINE = MergeTree ORDER BY id; + +INSERT INTO t_map +SELECT + number, + ( + arrayMap(x -> 'col' || toString(x), range(number % 10)), + range(number % 10) + )::Map(String, UInt64) +FROM numbers(1000000); + +INSERT INTO t_json SELECT id, m FROM t_map; +SELECT sum(m['col1']), sum(m['col4']), sum(m['col7']), sum(m['col8'] = 0) FROM t_map; +SELECT sum(obj.col1), sum(obj.col4), sum(obj.col7), sum(obj.col8 = 0) FROM t_json; +SELECT toTypeName(obj) FROM t_json LIMIT 1; + +INSERT INTO t_json +SELECT + number, + ( + arrayMap(x -> 'col' || toString(x), range(number % 10)), + range(number % 10) + )::Map(FixedString(4), UInt64) +FROM numbers(1000000); + +SELECT sum(obj.col1), sum(obj.col4), sum(obj.col7), sum(obj.col8 = 0) FROM t_json; + +INSERT INTO t_json +SELECT number, (range(number % 10), range(number % 10))::Map(UInt64, UInt64) +FROM numbers(1000000); -- { serverError 53 } + +DROP TABLE IF EXISTS t_json; +DROP TABLE IF EXISTS t_map; diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.reference b/tests/queries/0_stateless/01825_type_json_schema_inference.reference new file mode 100644 index 00000000000..c2c18b5a2ff --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.reference @@ -0,0 +1,8 @@ +{"id":"1","obj":{"k1":1,"k2":{"k3":"2","k4":[{"k5":3,"k6":0},{"k5":4,"k6":0}]},"some":0},"s":"foo"} +{"id":"2","obj":{"k1":0,"k2":{"k3":"str","k4":[{"k5":0,"k6":55}]},"some":42},"s":"bar"} +Tuple(k1 Int8, k2 Tuple(k3 String, k4 Nested(k5 Int8, k6 Int8)), 
some Int8) +{"id":"1","obj":"aaa","s":"foo"} +{"id":"2","obj":"bbb","s":"bar"} +{"map":{"k1":1,"k2":2},"obj":{"k1":1,"k2.k3":2},"map_type":"Map(String, Nullable(Float64))","obj_type":"Object('json')"} +{"obj":{"k1":1,"k2":2},"map":{"k1":"1","k2":"2"}} +Tuple(k1 Float64, k2 Float64) diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.sh b/tests/queries/0_stateless/01825_type_json_schema_inference.sh new file mode 100755 index 00000000000..4e8758761c5 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (id UInt64, obj JSON, s String) \ + ENGINE = MergeTree ORDER BY id" --allow_experimental_object_type 1 + +user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +mkdir -p ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/ +rm -rf ${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/* + +filename="${user_files_path}/${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json" + +echo '{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, "k4": [{"k5": 3}, {"k5": 4}]}}, "s": "foo"}' > $filename +echo '{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}' >> $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 +${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (id UInt64, obj String, s String) ENGINE = MergeTree ORDER BY id" + +echo '{"obj": "aaa", "id": 1, "s": "foo"}' > $filename +echo '{"id": 2, "obj": "bbb", "s": "bar"}' >> $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" + +echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": {"k3": 2}}}' > $filename + +${CLICKHOUSE_CLIENT} -q "SELECT map, obj, toTypeName(map) AS map_type, toTypeName(obj) AS obj_type \ + FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow') FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (obj JSON, map Map(String, UInt64)) \ + ENGINE = MergeTree ORDER BY tuple()" --allow_experimental_object_type 1 + +echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": 2}}' > $filename + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" +${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 +${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_json_inference" From 
a6450be8b625b26f188a0f176217226fe648de86 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Sat, 26 Mar 2022 01:11:16 +0000 Subject: [PATCH 12/58] fix schema inference --- src/Formats/JSONEachRowUtils.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 80b9f84dafe..fb1ddb479f2 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -159,29 +159,37 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) { auto object = field.getObject(); DataTypePtr value_type; - bool is_map = true; + bool is_object = false; for (const auto key_value_pair : object) { auto type = getDataTypeFromJSONFieldImpl(key_value_pair.second); - if (!type || isObject(type)) + if (!type) + continue; + + if (isObject(type)) { - is_map = false; + is_object = true; break; } - if (value_type && value_type->getName() != type->getName()) + if (!value_type) { - is_map = false; + value_type = type; + } + else if (!value_type->equals(*type)) + { + is_object = true; break; } - - value_type = type; } - if (is_map && value_type) + if (is_object) + return std::make_shared("json", false); + + if (value_type) return std::make_shared(std::make_shared(), value_type); - return std::make_shared("json", false); + return nullptr; } throw Exception{ErrorCodes::INCORRECT_DATA, "Unexpected JSON type"}; From 58a78fba12da9e298611a2ecdd1558eff31bfe53 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 28 Mar 2022 12:58:22 +0800 Subject: [PATCH 13/58] formatString bug fix and refactoring --- src/Common/format.h | 176 +++++++++++++++++ src/Functions/concat.cpp | 24 ++- src/Functions/formatString.cpp | 24 ++- src/Functions/formatString.h | 180 +----------------- ...245_format_string_stack_overflow.reference | 1 + .../02245_format_string_stack_overflow.sql | 1 + 6 files changed, 209 insertions(+), 197 deletions(-) create mode 100644 src/Common/format.h create mode 100644 tests/queries/0_stateless/02245_format_string_stack_overflow.reference create mode 100644 tests/queries/0_stateless/02245_format_string_stack_overflow.sql diff --git a/src/Common/format.h b/src/Common/format.h new file mode 100644 index 00000000000..234ff90061c --- /dev/null +++ b/src/Common/format.h @@ -0,0 +1,176 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +namespace Format +{ + using IndexPositions = PODArrayWithStackMemory; + + static inline void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res, UInt64 argument_number) + { + res = 0; + for (UInt64 pos = l; pos < r; ++pos) + { + if (!isNumericASCII(description[pos])) + throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS); + res = res * 10 + description[pos] - '0'; + if (res >= argument_number) + throw Exception( + "Too big number for arguments, must be at most " + std::to_string(argument_number - 1), ErrorCodes::BAD_ARGUMENTS); + } + } + + static inline void init( + const String & pattern, + size_t argument_number, + const std::vector> & constant_strings, + IndexPositions & index_positions, + std::vector & substrings) + { + /// Is current position after open curly brace. + bool is_open_curly = false; + /// The position of last open token. + size_t last_open = -1; + + /// Is formatting in a plain {} token. 
+ std::optional is_plain_numbering; + UInt64 index_if_plain = 0; + + /// Left position of adding substrings, just to the closed brace position or the start of the string. + /// Invariant --- the start of substring is in this position. + size_t start_pos = 0; + + /// A flag to decide whether we should glue the constant strings. + bool glue_to_next = false; + + /// Handling double braces (escaping). + auto double_brace_removal = [](String & str) + { + size_t i = 0; + bool should_delete = true; + str.erase( + std::remove_if( + str.begin(), + str.end(), + [&i, &should_delete, &str](char) + { + bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); + ++i; + if (is_double_brace && should_delete) + { + should_delete = false; + return true; + } + should_delete = true; + return false; + }), + str.end()); + }; + + index_positions.emplace_back(); + + for (size_t i = 0; i < pattern.size(); ++i) + { + if (pattern[i] == '{') + { + /// Escaping handling + /// It is safe to access because of null termination + if (pattern[i + 1] == '{') + { + ++i; + continue; + } + + if (is_open_curly) + throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); + + String to_add = String(pattern.data() + start_pos, i - start_pos); + double_brace_removal(to_add); + if (!glue_to_next) + substrings.emplace_back(to_add); + else + substrings.back() += to_add; + + glue_to_next = false; + + is_open_curly = true; + last_open = i + 1; + } + else if (pattern[i] == '}') + { + if (pattern[i + 1] == '}') + { + ++i; + continue; + } + + if (!is_open_curly) + throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); + + is_open_curly = false; + + if (last_open == i) + { + if (is_plain_numbering && !*is_plain_numbering) + throw Exception( + "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); + is_plain_numbering = true; + if (index_if_plain >= argument_number) + throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS); + index_positions.back() = index_if_plain++; + } + else + { + if (is_plain_numbering && *is_plain_numbering) + throw Exception( + "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); + is_plain_numbering = false; + + UInt64 arg; + parseNumber(pattern, last_open, i, arg, argument_number); + + if (arg >= argument_number) + throw Exception( + "Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS); + + index_positions.back() = arg; + } + + if (!constant_strings.empty() && constant_strings[index_positions.back()]) + { + /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. + glue_to_next = true; + substrings.back() += *constant_strings[index_positions.back()]; + } + else + index_positions.emplace_back(); /// Otherwise we commit arg number and proceed. 
+ + start_pos = i + 1; + } + } + + if (is_open_curly) + throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS); + + String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); + double_brace_removal(to_add); + + if (!glue_to_next) + substrings.emplace_back(to_add); + else + substrings.back() += to_add; + + index_positions.pop_back(); + } +} + +} diff --git a/src/Functions/concat.cpp b/src/Functions/concat.cpp index cd83223de3e..e11071265ce 100644 --- a/src/Functions/concat.cpp +++ b/src/Functions/concat.cpp @@ -52,23 +52,21 @@ public: { if (arguments.size() < 2) throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at least 2.", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - if (arguments.size() > FormatImpl::argument_threshold) - throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at most " + std::to_string(FormatImpl::argument_threshold), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at least 2", + getName(), + arguments.size()); for (const auto arg_idx : collections::range(0, arguments.size())) { const auto * arg = arguments[arg_idx].get(); if (!isStringOrFixedString(arg)) - throw Exception{"Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " - + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of argument {} of function {}", + arg->getName(), + arg_idx + 1, + getName()); } return std::make_shared(); @@ -125,7 +123,7 @@ private: std::vector data(num_arguments); std::vector offsets(num_arguments); std::vector fixed_string_sizes(num_arguments); - std::vector constant_strings(num_arguments); + std::vector> constant_strings(num_arguments); bool has_column_string = false; bool has_column_fixed_string = false; for (size_t i = 0; i < num_arguments; ++i) diff --git a/src/Functions/formatString.cpp b/src/Functions/formatString.cpp index e138c072639..ee90a082e5b 100644 --- a/src/Functions/formatString.cpp +++ b/src/Functions/formatString.cpp @@ -45,25 +45,23 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.empty()) + if (arguments.size() < 2) throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at least 1", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - if (arguments.size() > FormatImpl::argument_threshold) - throw Exception( - "Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size()) - + ", should be at most " + std::to_string(FormatImpl::argument_threshold), - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be at least 2", + getName(), + arguments.size()); for (const auto arg_idx : collections::range(0, arguments.size())) { const auto * arg = arguments[arg_idx].get(); if (!isStringOrFixedString(arg)) throw Exception( - "Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName(), - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + 
"Illegal type {} of argument {} of function {}", + arg->getName(), + arg_idx + 1, + getName()); } return std::make_shared(); @@ -84,7 +82,7 @@ public: std::vector data(arguments.size() - 1); std::vector offsets(arguments.size() - 1); std::vector fixed_string_sizes(arguments.size() - 1); - std::vector constant_strings(arguments.size() - 1); + std::vector> constant_strings(arguments.size() - 1); bool has_column_string = false; bool has_column_fixed_string = false; diff --git a/src/Functions/formatString.h b/src/Functions/formatString.h index 419ecf1c773..3c5057ecb97 100644 --- a/src/Functions/formatString.h +++ b/src/Functions/formatString.h @@ -4,8 +4,10 @@ #include #include #include +#include #include + #include #include #include @@ -22,8 +24,6 @@ namespace ErrorCodes struct FormatImpl { - static constexpr size_t small_argument_threshold = 1024; - static constexpr size_t argument_threshold = std::numeric_limits::max(); static constexpr size_t right_padding = 15; template @@ -39,165 +39,10 @@ struct FormatImpl format(std::forward(args)...); } - static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res) - { - res = 0; - for (UInt64 pos = l; pos < r; ++pos) - { - if (!isNumericASCII(description[pos])) - throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS); - res = res * 10 + description[pos] - '0'; - if (res >= argument_threshold) - throw Exception( - "Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::BAD_ARGUMENTS); - } - } - - static inline void init( - const String & pattern, - const std::vector & data, - size_t argument_number, - const std::vector & constant_strings, - UInt64 * index_positions_ptr, - std::vector & substrings) - { - /// Is current position after open curly brace. - bool is_open_curly = false; - /// The position of last open token. - size_t last_open = -1; - - /// Is formatting in a plain {} token. - std::optional is_plain_numbering; - UInt64 index_if_plain = 0; - - /// Left position of adding substrings, just to the closed brace position or the start of the string. - /// Invariant --- the start of substring is in this position. - size_t start_pos = 0; - - /// A flag to decide whether we should glue the constant strings. - bool glue_to_next = false; - - /// Handling double braces (escaping). 
- auto double_brace_removal = [](String & str) - { - size_t i = 0; - bool should_delete = true; - str.erase( - std::remove_if( - str.begin(), - str.end(), - [&i, &should_delete, &str](char) - { - bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}'); - ++i; - if (is_double_brace && should_delete) - { - should_delete = false; - return true; - } - should_delete = true; - return false; - }), - str.end()); - }; - - for (size_t i = 0; i < pattern.size(); ++i) - { - if (pattern[i] == '{') - { - /// Escaping handling - /// It is safe to access because of null termination - if (pattern[i + 1] == '{') - { - ++i; - continue; - } - - if (is_open_curly) - throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); - - String to_add = String(pattern.data() + start_pos, i - start_pos); - double_brace_removal(to_add); - if (!glue_to_next) - substrings.emplace_back(to_add); - else - substrings.back() += to_add; - - glue_to_next = false; - - is_open_curly = true; - last_open = i + 1; - } - else if (pattern[i] == '}') - { - if (pattern[i + 1] == '}') - { - ++i; - continue; - } - - if (!is_open_curly) - throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS); - - is_open_curly = false; - - if (last_open == i) - { - if (is_plain_numbering && !*is_plain_numbering) - throw Exception( - "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); - is_plain_numbering = true; - if (index_if_plain >= argument_number) - throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS); - *index_positions_ptr = index_if_plain++; - } - else - { - if (is_plain_numbering && *is_plain_numbering) - throw Exception( - "Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS); - is_plain_numbering = false; - - UInt64 arg; - parseNumber(pattern, last_open, i, arg); - - if (arg >= argument_number) - throw Exception( - "Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS); - - *index_positions_ptr = arg; - } - - /// Constant string. - if (!data[*index_positions_ptr]) - { - /// The next string should be glued to last `A {} C`.format('B') -> `A B C`. - glue_to_next = true; - substrings.back() += constant_strings[*index_positions_ptr]; - } - else - ++index_positions_ptr; /// Otherwise we commit arg number and proceed. - - start_pos = i + 1; - } - } - - if (is_open_curly) - throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS); - - String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos); - double_brace_removal(to_add); - - if (!glue_to_next) - substrings.emplace_back(to_add); - else - substrings.back() += to_add; - } - /// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string. /// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString. /// fixed_string_N for savings N to fixed strings. - /// constant_strings for constant strings. If data[i] is nullptr, than it is constant string. + /// constant_strings for constant strings. If data[i] is nullptr, it is constant string. /// res_data is result_data, res_offsets is offset result. /// input_rows_count is the number of rows processed. /// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size(). 
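The overflow addressed by the following hunk appears to come from sizing the index buffer by the number of arguments while writing one entry per placeholder: the old small_index_positions_buffer (1024 slots on the stack) is kept whenever argument_number <= 1024, yet a pattern such as the regression test's '{0}{0}...{0}' with a single argument produces far more writes than slots. Format::IndexPositions (a PODArrayWithStackMemory) instead grows per parsed placeholder. A standalone sketch of that idea, not ClickHouse code, with double-brace escaping and the automatic/manual numbering check omitted and all names hypothetical, could look like this:

    // Sketch only: collect one index per '{}' / '{N}' placeholder into a growable
    // container, so storage is bounded by the placeholder count, not by the
    // argument count (the old fixed-size stack buffer conflated the two).
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    std::vector<uint64_t> collectPlaceholderIndexes(const std::string & pattern, size_t argument_count)
    {
        std::vector<uint64_t> positions;  // grows with the pattern, never overflows a fixed slot count
        uint64_t next_auto_index = 0;
        for (size_t i = 0; i < pattern.size(); ++i)
        {
            if (pattern[i] != '{')
                continue;
            size_t close = pattern.find('}', i);
            if (close == std::string::npos)
                throw std::runtime_error("Last open curly brace is not closed");
            std::string body = pattern.substr(i + 1, close - i - 1);
            uint64_t index = body.empty() ? next_auto_index++ : std::stoull(body);
            if (index >= argument_count)
                throw std::runtime_error("Argument is too big for formatting");
            positions.push_back(index);
            i = close;
        }
        return positions;
    }

    int main()
    {
        std::string pattern;
        for (int i = 0; i < 2000; ++i)  // far more placeholders than the old 1024-slot stack array
            pattern += "{0}";
        std::cout << collectPlaceholderIndexes(pattern, 1).size() << " placeholders, 1 argument\n";
    }

With 2000 placeholders and one argument this simply prints the placeholder count, whereas a buffer sized by the argument count would have been overrun, which is the scenario covered by 02245_format_string_stack_overflow.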
@@ -207,29 +52,22 @@ struct FormatImpl const std::vector & data, const std::vector & offsets, [[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector & fixed_string_N, - const std::vector & constant_strings, + const std::vector> & constant_strings, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets, size_t input_rows_count) { const size_t argument_number = offsets.size(); - UInt64 small_index_positions_buffer[small_argument_threshold]; - /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices. - UInt64 * index_positions = small_index_positions_buffer; - - std::unique_ptr big_index_positions_buffer; - if (argument_number > small_argument_threshold) - { - big_index_positions_buffer.reset(new UInt64[argument_number]); - index_positions = big_index_positions_buffer.get(); - } + /// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this + /// array will be filled with [1, 3, 1, 0] but without constant string indices. + Format::IndexPositions index_positions; /// Vector of substrings of pattern that will be copied to the answer, not string view because of escaping and iterators invalidation. /// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``]. std::vector substrings; - init(pattern, data, argument_number, constant_strings, index_positions, substrings); + Format::init(pattern, argument_number, constant_strings, index_positions, substrings); UInt64 final_size = 0; @@ -271,7 +109,7 @@ struct FormatImpl for (size_t j = 1; j < substrings.size(); ++j) { UInt64 arg = index_positions[j - 1]; - auto offset_ptr = offsets[arg]; + const auto * offset_ptr = offsets[arg]; UInt64 arg_offset = 0; UInt64 size = 0; diff --git a/tests/queries/0_stateless/02245_format_string_stack_overflow.reference b/tests/queries/0_stateless/02245_format_string_stack_overflow.reference new file mode 100644 index 00000000000..6e163a64914 --- /dev/null +++ b/tests/queries/0_stateless/02245_format_string_stack_overflow.reference @@ -0,0 +1 @@ 
+00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 diff --git a/tests/queries/0_stateless/02245_format_string_stack_overflow.sql b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql new file mode 100644 index 00000000000..1ee3606d3a6 --- /dev/null +++ b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql @@ -0,0 +1 @@ +select 
format('{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{
0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}', toString(number)) str from numbers(1); From d9d826c813cafced31842be0ac3397b1c6292f60 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 28 Mar 2022 08:19:23 +0000 Subject: [PATCH 14/58] Address PR review --- src/Common/RangeGenerator.h | 2 +- src/IO/ReadBufferFromS3.cpp | 19 +++++++++++++------ src/IO/ReadBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 17 ++++------------- src/Storages/StorageS3.h | 2 ++ 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/Common/RangeGenerator.h b/src/Common/RangeGenerator.h index 8d3b88936fc..8f097087c8c 100644 --- a/src/Common/RangeGenerator.h +++ b/src/Common/RangeGenerator.h @@ -34,7 +34,7 @@ public: Range range{from, to}; from = to; - return std::move(range); + return range; } private: diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index c78f41a0f9d..6616d92b492 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -1,4 +1,5 @@ #include +#include "IO/S3Common.h" #if USE_AWS_S3 @@ -212,13 +213,14 @@ std::optional ReadBufferFromS3::getTotalSize() if (file_size) return 
file_size; - Aws::S3::Model::HeadObjectRequest request; - request.SetBucket(bucket); - request.SetKey(key); + auto object_size = S3::getObjectSize(client_ptr, bucket, key, false); - auto outcome = client_ptr->HeadObject(request); - auto head_result = outcome.GetResultWithOwnership(); - file_size = head_result.GetContentLength(); + if (!object_size) + { + return std::nullopt; + } + + file_size = object_size; return file_size; } @@ -236,6 +238,11 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position) } } +SeekableReadBuffer::Range ReadBufferFromS3::getRemainingReadRange() const +{ + return Range{.left = static_cast(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt}; +} + std::unique_ptr ReadBufferFromS3::initialize() { Aws::S3::Model::GetObjectRequest req; diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index dbba751b321..5c9d709d58e 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -63,7 +63,7 @@ public: void setReadUntilPosition(size_t position) override; - Range getRemainingReadRange() const override { return Range{.left = static_cast(offset), .right = read_until_position - 1}; } + Range getRemainingReadRange() const override; size_t getFileOffsetOfBufferEnd() const override { return offset; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index b40e352e1fb..411f78e44d6 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -307,7 +307,7 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k const bool object_too_small = object_size < download_thread_num * download_buffer_size; if (!use_parallel_download || object_too_small) { - LOG_TRACE(&Poco::Logger::get("StorageS3Source"), "Downloading object of size {} from S3 in single thread", object_size); + LOG_TRACE(log, "Downloading object of size {} from S3 in single thread", object_size); return std::make_unique(client, bucket, key, max_single_read_retries, getContext()->getReadSettings()); } @@ -315,29 +315,20 @@ std::unique_ptr StorageS3Source::createS3ReadBuffer(const String & k if (download_buffer_size < DBMS_DEFAULT_BUFFER_SIZE) { - LOG_WARNING( - &Poco::Logger::get("StorageS3Source"), - "Downloading buffer {} bytes too small, set at least {} bytes", - download_buffer_size, - DBMS_DEFAULT_BUFFER_SIZE); + LOG_WARNING(log, "Downloading buffer {} bytes too small, set at least {} bytes", download_buffer_size, DBMS_DEFAULT_BUFFER_SIZE); download_buffer_size = DBMS_DEFAULT_BUFFER_SIZE; } auto factory = std::make_unique( client, bucket, key, download_buffer_size, object_size, max_single_read_retries, getContext()->getReadSettings()); LOG_TRACE( - &Poco::Logger::get("StorageS3Source"), - "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", - download_thread_num, - object_size, - download_buffer_size); + log, "Downloading from S3 in {} threads. Object size: {}, Range size: {}.", download_thread_num, object_size, download_buffer_size); ThreadGroupStatusPtr running_group = CurrentThread::isInitialized() && CurrentThread::get().getThreadGroup() ? CurrentThread::get().getThreadGroup() : MainThreadStatus::getInstance().getThreadGroup(); - ContextPtr query_context - = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; + ContextPtr query_context = CurrentThread::isInitialized() ? 
CurrentThread::get().getQueryContext() : nullptr; auto worker_cleanup = [has_running_group = running_group == nullptr](ThreadStatus & thread_status) { diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 6362e777df6..e67223190ba 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -107,6 +107,8 @@ private: std::shared_ptr file_iterator; size_t download_thread_num = 1; + Poco::Logger * log = &Poco::Logger::get("StorageS3Source"); + /// Recreate ReadBuffer and BlockInputStream for each file. bool initialize(); From 6003a49a877908ac1d9f4651b7d507e6055e9a9b Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 28 Mar 2022 17:48:47 +0800 Subject: [PATCH 15/58] Fix style --- src/Common/format.h | 2 ++ src/Functions/formatString.h | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/Common/format.h b/src/Common/format.h index 234ff90061c..a9382f247ab 100644 --- a/src/Common/format.h +++ b/src/Common/format.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include diff --git a/src/Functions/formatString.h b/src/Functions/formatString.h index 3c5057ecb97..44fbdac9378 100644 --- a/src/Functions/formatString.h +++ b/src/Functions/formatString.h @@ -17,10 +17,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; -} struct FormatImpl { From 065bc13aad41c6a564e5803d0408a91f640fec43 Mon Sep 17 00:00:00 2001 From: pdai Date: Mon, 28 Mar 2022 20:58:18 +0800 Subject: [PATCH 16/58] fix typo --- .../mergetree-family/versionedcollapsingmergetree.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md index b81d2206bf4..f5f2c428ea7 100644 --- a/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ b/docs/zh/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -8,7 +8,7 @@ toc_title: "版本折叠MergeTree" 这个引擎: - 允许快速写入不断变化的对象状态。 -- 删除后台中的旧对象状态。 这显着降低了存储体积。 +- 删除后台中的旧对象状态。 这显著降低了存储体积。 请参阅部分 [崩溃](#table_engines_versionedcollapsingmergetree) 有关详细信息。 From 0e4af89f69e1340d27846f24faf14268ae0c4a10 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 17:21:47 +0000 Subject: [PATCH 17/58] fix reading from type object --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 5 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- .../MergeTree/MergeTreeBlockReadUtils.cpp | 39 ++++++------ .../MergeTree/MergeTreeBlockReadUtils.h | 2 +- .../MergeTree/MergeTreeSequentialSource.cpp | 2 +- src/Storages/StorageSnapshot.cpp | 62 ++++++++++--------- src/Storages/StorageSnapshot.h | 4 ++ .../01825_type_json_missed_values.reference | 2 + .../01825_type_json_missed_values.sql | 19 ++++++ 9 files changed, 82 insertions(+), 55 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_missed_values.reference create mode 100644 tests/queries/0_stateless/01825_type_json_missed_values.sql diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 9028023dc80..6e1d73da427 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -558,9 +558,10 @@ size_t IMergeTreeDataPart::getFileSizeOrZero(const String & file_name) const return checksum->second.file_size; } -String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageMetadataPtr & metadata_snapshot) const +String 
IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(const StorageSnapshotPtr & storage_snapshot) const { - const auto & storage_columns = metadata_snapshot->getColumns().getAllPhysical(); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects().withSubcolumns(); + auto storage_columns = storage_snapshot->getColumns(options); MergeTreeData::AlterConversions alter_conversions; if (!parent_part) alter_conversions = storage.getAlterConversionsForPart(shared_from_this()); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 85088b10f70..0dfac49bf30 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -164,7 +164,7 @@ public: /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()). /// If no checksums are present returns the name of the first physically existing column. - String getColumnNameWithMinimumCompressedSize(const StorageMetadataPtr & metadata_snapshot) const; + String getColumnNameWithMinimumCompressedSize(const StorageSnapshotPtr & storage_snapshot) const; bool contains(const IMergeTreeDataPart & other) const { return info.contains(other.info); } diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 6e72b843f10..97516ea06c6 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -24,7 +24,7 @@ namespace /// least one existing (physical) column in part. bool injectRequiredColumnsRecursively( const String & column_name, - const ColumnsDescription & storage_columns, + const StorageSnapshotPtr & storage_snapshot, const MergeTreeData::AlterConversions & alter_conversions, const MergeTreeData::DataPartPtr & part, Names & columns, @@ -36,7 +36,8 @@ bool injectRequiredColumnsRecursively( /// stages. checkStackSize(); - auto column_in_storage = storage_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::AllPhysical, column_name); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects(); + auto column_in_storage = storage_snapshot->tryGetColumn(options, column_name); if (column_in_storage) { auto column_name_in_part = column_in_storage->getNameInStorage(); @@ -63,7 +64,8 @@ bool injectRequiredColumnsRecursively( /// Column doesn't have default value and don't exist in part /// don't need to add to required set. 
- const auto column_default = storage_columns.getDefault(column_name); + auto metadata_snapshot = storage_snapshot->getMetadataForQuery(); + const auto column_default = metadata_snapshot->getColumns().getDefault(column_name); if (!column_default) return false; @@ -73,39 +75,36 @@ bool injectRequiredColumnsRecursively( bool result = false; for (const auto & identifier : identifiers) - result |= injectRequiredColumnsRecursively(identifier, storage_columns, alter_conversions, part, columns, required_columns, injected_columns); + result |= injectRequiredColumnsRecursively(identifier, storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns); return result; } } -NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & part, Names & columns) +NameSet injectRequiredColumns( + const MergeTreeData & storage, + const StorageSnapshotPtr & storage_snapshot, + const MergeTreeData::DataPartPtr & part, + Names & columns) { NameSet required_columns{std::begin(columns), std::end(columns)}; NameSet injected_columns; bool have_at_least_one_physical_column = false; - - const auto & storage_columns = metadata_snapshot->getColumns(); MergeTreeData::AlterConversions alter_conversions; if (!part->isProjectionPart()) alter_conversions = storage.getAlterConversionsForPart(part); + for (size_t i = 0; i < columns.size(); ++i) { - auto name_in_storage = Nested::extractTableName(columns[i]); - if (storage_columns.has(name_in_storage) && isObject(storage_columns.get(name_in_storage).type)) - { - have_at_least_one_physical_column = true; - continue; - } - /// We are going to fetch only physical columns - if (!storage_columns.hasColumnOrSubcolumn(GetColumnsOptions::AllPhysical, columns[i])) - throw Exception("There is no physical column or subcolumn " + columns[i] + " in table.", ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns().withExtendedObjects(); + if (!storage_snapshot->tryGetColumn(options, columns[i])) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no physical column or subcolumn {} in table", columns[i]); have_at_least_one_physical_column |= injectRequiredColumnsRecursively( - columns[i], storage_columns, alter_conversions, + columns[i], storage_snapshot, alter_conversions, part, columns, required_columns, injected_columns); } @@ -115,7 +114,7 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada */ if (!have_at_least_one_physical_column) { - const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(metadata_snapshot); + const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(storage_snapshot); columns.push_back(minimum_size_column_name); /// correctly report added column injected_columns.insert(columns.back()); @@ -271,7 +270,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names pre_column_names; /// inject columns required for defaults evaluation - bool should_reorder = !injectRequiredColumns(storage, storage_snapshot->getMetadataForQuery(), data_part, column_names).empty(); + bool should_reorder = !injectRequiredColumns(storage, storage_snapshot, data_part, column_names).empty(); if (prewhere_info) { @@ -296,7 +295,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); - const auto injected_pre_columns = injectRequiredColumns(storage, 
storage_snapshot->getMetadataForQuery(), data_part, pre_column_names); + const auto injected_pre_columns = injectRequiredColumns(storage, storage_snapshot, data_part, pre_column_names); if (!injected_pre_columns.empty()) should_reorder = true; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 2373881f954..b4293b4ce3d 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -22,7 +22,7 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptrrows_count); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, storage_snapshot->metadata, data_part, columns_to_read); + injectRequiredColumns(storage, storage_snapshot, data_part, columns_to_read); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) { diff --git a/src/Storages/StorageSnapshot.cpp b/src/Storages/StorageSnapshot.cpp index e214afc6a90..8a82c5387c5 100644 --- a/src/Storages/StorageSnapshot.cpp +++ b/src/Storages/StorageSnapshot.cpp @@ -51,40 +51,42 @@ NamesAndTypesList StorageSnapshot::getColumns(const GetColumnsOptions & options) NamesAndTypesList StorageSnapshot::getColumnsByNames(const GetColumnsOptions & options, const Names & names) const { NamesAndTypesList res; - const auto & columns = getMetadataForQuery()->getColumns(); for (const auto & name : names) + res.push_back(getColumn(options, name)); + return res; +} + +std::optional StorageSnapshot::tryGetColumn(const GetColumnsOptions & options, const String & column_name) const +{ + const auto & columns = getMetadataForQuery()->getColumns(); + auto column = columns.tryGetColumn(options, column_name); + if (column && (!isObject(column->type) || !options.with_extended_objects)) + return column; + + if (options.with_extended_objects) { - auto column = columns.tryGetColumn(options, name); - if (column && !isObject(column->type)) - { - res.emplace_back(std::move(*column)); - continue; - } - - if (options.with_extended_objects) - { - auto object_column = object_columns.tryGetColumn(options, name); - if (object_column) - { - res.emplace_back(std::move(*object_column)); - continue; - } - } - - if (options.with_virtuals) - { - auto it = virtual_columns.find(name); - if (it != virtual_columns.end()) - { - res.emplace_back(name, it->second); - continue; - } - } - - throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", name); + auto object_column = object_columns.tryGetColumn(options, column_name); + if (object_column) + return object_column; } - return res; + if (options.with_virtuals) + { + auto it = virtual_columns.find(column_name); + if (it != virtual_columns.end()) + return NameAndTypePair(column_name, it->second); + } + + return {}; +} + +NameAndTypePair StorageSnapshot::getColumn(const GetColumnsOptions & options, const String & column_name) const +{ + auto column = tryGetColumn(options, column_name); + if (!column) + throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "There is no column {} in table", column_name); + + return *column; } Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) const diff --git a/src/Storages/StorageSnapshot.h b/src/Storages/StorageSnapshot.h index 46244827f6c..909f4fd5cab 100644 --- a/src/Storages/StorageSnapshot.h +++ b/src/Storages/StorageSnapshot.h @@ -61,6 +61,10 @@ struct StorageSnapshot /// Get columns with types according to options only for requested names. 
NamesAndTypesList getColumnsByNames(const GetColumnsOptions & options, const Names & names) const; + /// Get column with type according to options for requested name. + std::optional tryGetColumn(const GetColumnsOptions & options, const String & column_name) const; + NameAndTypePair getColumn(const GetColumnsOptions & options, const String & column_name) const; + /// Block with ordinary + materialized + aliases + virtuals + subcolumns. Block getSampleBlockForColumns(const Names & column_names) const; diff --git a/tests/queries/0_stateless/01825_type_json_missed_values.reference b/tests/queries/0_stateless/01825_type_json_missed_values.reference new file mode 100644 index 00000000000..b480493995b --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_missed_values.reference @@ -0,0 +1,2 @@ +Tuple(foo Int8, k1 Int8, k2 Int8) +1 diff --git a/tests/queries/0_stateless/01825_type_json_missed_values.sql b/tests/queries/0_stateless/01825_type_json_missed_values.sql new file mode 100644 index 00000000000..2420ab7cf34 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_missed_values.sql @@ -0,0 +1,19 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) +ENGINE = MergeTree ORDER BY id +SETTINGS min_bytes_for_wide_part = 0; + +SYSTEM STOP MERGES t_json; + +INSERT INTO t_json SELECT number, '{"k1": 1, "k2": 2}' FROM numbers(1000000); +INSERT INTO t_json VALUES (1000001, '{"foo": 1}'); + +SELECT toTypeName(obj) FROM t_json LIMIT 1; +SELECT count() FROM t_json WHERE obj.foo != 0; + +DROP TABLE IF EXISTS t_json; From 24c0cf86d44682ea0d8fb967cb17837191381d82 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 19:25:51 +0000 Subject: [PATCH 18/58] add function 'flattenTuple' --- src/Functions/flattenTuple.cpp | 68 +++++++++++++++++++ .../registerFunctionsMiscellaneous.cpp | 2 + .../0_stateless/02246_flatten_tuple.reference | 4 ++ .../0_stateless/02246_flatten_tuple.sql | 22 ++++++ 4 files changed, 96 insertions(+) create mode 100644 src/Functions/flattenTuple.cpp create mode 100644 tests/queries/0_stateless/02246_flatten_tuple.reference create mode 100644 tests/queries/0_stateless/02246_flatten_tuple.sql diff --git a/src/Functions/flattenTuple.cpp b/src/Functions/flattenTuple.cpp new file mode 100644 index 00000000000..f5d5b4cb773 --- /dev/null +++ b/src/Functions/flattenTuple.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + +class FunctionFlattenTuple : public IFunction +{ +public: + static constexpr auto name = "flattenTuple"; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + size_t getNumberOfArguments() const override { return 1; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto & type = arguments[0]; + const auto * type_tuple = checkAndGetDataType(type.get()); + if (!type_tuple || !type_tuple->haveExplicitNames()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function '{}' must be Named Tuple. 
Got '{}'", + getName(), type->getName()); + + auto [paths, types] = flattenTuple(type); + Names names; + names.reserve(paths.size()); + for (const auto & path : paths) + names.push_back(path.getPath()); + + return std::make_shared(types, names); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override + { + auto column = arguments.at(0).column; + if (!checkAndGetColumn(column.get())) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Illegal column {} of first argument of function {}. Expected ColumnTuple", + column->getName(), getName()); + + return flattenTuple(column); + } +}; + +} + +void registerFunctionFlattenTuple(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 76d61ce509a..40a8409cd15 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -80,6 +80,7 @@ void registerFunctionInitialQueryID(FunctionFactory & factory); void registerFunctionServerUUID(FunctionFactory &); void registerFunctionZooKeeperSessionUptime(FunctionFactory &); void registerFunctionGetOSKernelVersion(FunctionFactory &); +void registerFunctionFlattenTuple(FunctionFactory &); #if USE_ICU void registerFunctionConvertCharset(FunctionFactory &); @@ -166,6 +167,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionServerUUID(factory); registerFunctionZooKeeperSessionUptime(factory); registerFunctionGetOSKernelVersion(factory); + registerFunctionFlattenTuple(factory); #if USE_ICU registerFunctionConvertCharset(factory); diff --git a/tests/queries/0_stateless/02246_flatten_tuple.reference b/tests/queries/0_stateless/02246_flatten_tuple.reference new file mode 100644 index 00000000000..0320150025d --- /dev/null +++ b/tests/queries/0_stateless/02246_flatten_tuple.reference @@ -0,0 +1,4 @@ +([1,2],['a','b'],3,'c',4) Tuple(`t1.a` Array(UInt32), `t1.s` Array(String), b UInt32, `t2.k` String, `t2.v` UInt32) +Tuple(id Int8, obj Tuple(k1 Int8, k2 Tuple(k3 String, k4 Nested(k5 Int8, k6 Int8)), some Int8), s String) Tuple(id Int8, `obj.k1` Int8, `obj.k2.k3` String, `obj.k2.k4.k5` Array(Int8), `obj.k2.k4.k6` Array(Int8), `obj.some` Int8, s String) +1 1 2 [3,4] [0,0] 0 foo +2 0 str [0] [55] 42 bar diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql new file mode 100644 index 00000000000..e73f382b261 --- /dev/null +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_flatten_tuple; +DROP TABLE IF EXISTS t_flatten_object; + +SET flatten_nested = 0; + +CREATE TABLE t_flatten_tuple(t Tuple(t1 Nested(a UInt32, s String), b UInt32, t2 Tuple(k String, v UInt32))) ENGINE = Memory; + +INSERT INTO t_flatten_tuple VALUES (([(1, 'a'), (2, 'b')], 3, ('c', 4))); + +SELECT flattenTuple(t) AS ft, toTypeName(ft) FROM t_flatten_tuple; + +SET allow_experimental_object_type = 1; +CREATE TABLE t_flatten_object(data JSON) ENGINE = Memory; + +INSERT INTO t_flatten_object VALUES ('{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, "k4": [{"k5": 3}, {"k5": 4}]}}, "s": "foo"}'); +INSERT INTO t_flatten_object VALUES ('{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}'); + +SELECT toTypeName(data), toTypeName(flattenTuple(data)) FROM t_flatten_object LIMIT 1; +SELECT untuple(flattenTuple(data)) FROM t_flatten_object; + +DROP 
TABLE IF EXISTS t_flatten_tuple; +DROP TABLE IF EXISTS t_flatten_object; From 36ea9ef11fce7a8d8672d376606ccbe6f6547831 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 28 Mar 2022 19:52:10 +0000 Subject: [PATCH 19/58] Fix creating materialized view with subquery after server restart --- src/Databases/DatabaseOnDisk.cpp | 4 ++++ .../__init__.py | 0 .../test.py | 23 +++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100755 tests/integration/test_materialized_view_restart_server/__init__.py create mode 100755 tests/integration/test_materialized_view_restart_server/test.py diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 9f56b5f7676..f12009cef1a 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,9 @@ std::pair createTableFromAST( ast_create_query.attach = true; ast_create_query.setDatabase(database_name); + if (ast_create_query.select && ast_create_query.isView()) + ApplyWithSubqueryVisitor().visit(*ast_create_query.select); + if (ast_create_query.as_table_function) { const auto & factory = TableFunctionFactory::instance(); diff --git a/tests/integration/test_materialized_view_restart_server/__init__.py b/tests/integration/test_materialized_view_restart_server/__init__.py new file mode 100755 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_materialized_view_restart_server/test.py b/tests/integration/test_materialized_view_restart_server/test.py new file mode 100755 index 00000000000..828d8c279bf --- /dev/null +++ b/tests/integration/test_materialized_view_restart_server/test.py @@ -0,0 +1,23 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance("node", stay_alive=True) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_materialized_view_with_subquery(start_cluster): + node.query("create table test (x UInt32) engine=TineLog()") + node.query("create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery") + node.restart_clickhouse(kill=True) + node.query("insert into test select 1") + result = node.query("select * from mv") + assert int(result) == 1 From c066593595bd70a16cc0c84bd88778da1d801167 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 28 Mar 2022 22:02:28 +0200 Subject: [PATCH 20/58] Update test.py --- .../integration/test_materialized_view_restart_server/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_materialized_view_restart_server/test.py b/tests/integration/test_materialized_view_restart_server/test.py index 828d8c279bf..4ede9f0f062 100755 --- a/tests/integration/test_materialized_view_restart_server/test.py +++ b/tests/integration/test_materialized_view_restart_server/test.py @@ -16,7 +16,9 @@ def start_cluster(): def test_materialized_view_with_subquery(start_cluster): node.query("create table test (x UInt32) engine=TineLog()") - node.query("create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery") + node.query( + "create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery" + ) node.restart_clickhouse(kill=True) node.query("insert into test select 1") result = 
node.query("select * from mv") From cd3cc60d5dbdf05bcf585bac7b2bd4c71d1177ed Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 21:13:20 +0000 Subject: [PATCH 21/58] fix fasttest --- tests/queries/0_stateless/02246_flatten_tuple.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql index e73f382b261..927e3b9ec80 100644 --- a/tests/queries/0_stateless/02246_flatten_tuple.sql +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + DROP TABLE IF EXISTS t_flatten_tuple; DROP TABLE IF EXISTS t_flatten_object; From 98fe1ca1ecefed61b3ea4b4416aef4988546ab2c Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 29 Mar 2022 12:02:21 +0200 Subject: [PATCH 22/58] Fix --- programs/local/LocalServer.cpp | 9 +++---- ...6_clickhouse_local_drop_database.reference | 0 .../02246_clickhouse_local_drop_database.sh | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference create mode 100755 tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 26d42a11315..bb6684ca137 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -184,6 +184,11 @@ void LocalServer::tryInitPath() if (path.back() != '/') path += '/'; + fs::create_directories(fs::path(path) / "user_defined/"); + fs::create_directories(fs::path(path) / "data/"); + fs::create_directories(fs::path(path) / "metadata/"); + fs::create_directories(fs::path(path) / "metadata_dropped/"); + global_context->setPath(path); global_context->setTemporaryStorage(path + "tmp"); @@ -565,7 +570,6 @@ void LocalServer::processConfig() /// Lock path directory before read status.emplace(fs::path(path) / "status", StatusFile::write_full_info); - fs::create_directories(fs::path(path) / "user_defined/"); LOG_DEBUG(log, "Loading user defined objects from {}", path); Poco::File(path + "user_defined/").createDirectories(); UserDefinedSQLObjectsLoader::instance().loadObjects(global_context); @@ -573,9 +577,6 @@ void LocalServer::processConfig() LOG_DEBUG(log, "Loaded user defined objects."); LOG_DEBUG(log, "Loading metadata from {}", path); - fs::create_directories(fs::path(path) / "data/"); - fs::create_directories(fs::path(path) / "metadata/"); - loadMetadataSystem(global_context); attachSystemTablesLocal(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE)); attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA)); diff --git a/tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh new file mode 100755 index 00000000000..00f3904192f --- /dev/null +++ b/tests/queries/0_stateless/02246_clickhouse_local_drop_database.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +dir=${CLICKHOUSE_TEST_UNIQUE_NAME} +[[ -d $dir ]] && rm -r $dir +mkdir $dir +$CLICKHOUSE_LOCAL --multiline --multiquery --path $dir -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE IF NOT EXISTS test; +USE test; +CREATE TABLE test (id Int32) ENGINE=MergeTree() ORDER BY id; +DROP DATABASE test; +""" + +$CLICKHOUSE_LOCAL --multiline --multiquery -q """ +DROP DATABASE IF EXISTS test; +CREATE DATABASE IF NOT EXISTS test; +USE test; +CREATE TABLE test (id Int32) ENGINE=MergeTree() ORDER BY id; +DROP DATABASE test; +""" From f0b4a9fd15642240619dddfd4ebd5419b8b54e79 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 10:56:49 +0000 Subject: [PATCH 23/58] Fix construction of ReadBufferFromS3 --- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 8f91804bbbe..0d50b24f7a5 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -44,7 +44,7 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S { return std::make_unique( client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, - settings, /* use_external_buffer */true, read_until_position, /* restricted_seek */true); + settings, /* use_external_buffer */true, /* offset */ 0, read_until_position, /* restricted_seek */true); }; if (with_cache) From a2aab8ffacbfa74b9d86bdb938eb04b356152f89 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:07:19 +0200 Subject: [PATCH 24/58] Fix test --- tests/integration/test_materialized_view_restart_server/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_materialized_view_restart_server/test.py b/tests/integration/test_materialized_view_restart_server/test.py index 4ede9f0f062..bfadec5136b 100755 --- a/tests/integration/test_materialized_view_restart_server/test.py +++ b/tests/integration/test_materialized_view_restart_server/test.py @@ -15,7 +15,7 @@ def start_cluster(): def test_materialized_view_with_subquery(start_cluster): - node.query("create table test (x UInt32) engine=TineLog()") + node.query("create table test (x UInt32) engine=TinyLog()") node.query( "create materialized view mv engine = TinyLog() as with subquery as (select * from test) select * from subquery" ) From c59926dcc7e3bb52be7b9b86ead120ae0d8b640d Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Mar 2022 12:13:02 +0000 Subject: [PATCH 25/58] fix tests --- tests/queries/0_stateless/02246_flatten_tuple.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02246_flatten_tuple.sql b/tests/queries/0_stateless/02246_flatten_tuple.sql index 927e3b9ec80..139f7a621ef 100644 --- a/tests/queries/0_stateless/02246_flatten_tuple.sql +++ b/tests/queries/0_stateless/02246_flatten_tuple.sql @@ -18,7 +18,7 @@ INSERT INTO t_flatten_object VALUES ('{"id": 1, "obj": {"k1": 1, "k2": {"k3": 2, INSERT INTO t_flatten_object VALUES ('{"id": 2, "obj": {"k2": {"k3": "str", "k4": [{"k6": 55}]}, "some": 42}, "s": "bar"}'); SELECT toTypeName(data), toTypeName(flattenTuple(data)) FROM t_flatten_object LIMIT 1; -SELECT untuple(flattenTuple(data)) FROM t_flatten_object; +SELECT untuple(flattenTuple(data)) FROM t_flatten_object ORDER BY data.id; DROP TABLE IF EXISTS t_flatten_tuple; DROP 
TABLE IF EXISTS t_flatten_object; From ebd72e433d83dba85b3563af95b6aef015f8a4b8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 14:41:47 +0200 Subject: [PATCH 26/58] Fail checks in case of errors --- tests/ci/build_report_check.py | 3 +++ tests/ci/compatibility_check.py | 3 +++ tests/ci/docker_images_check.py | 3 +++ tests/ci/docs_check.py | 3 +++ tests/ci/fast_test_check.py | 2 +- tests/ci/integration_test_check.py | 3 +++ tests/ci/performance_comparison_check.py | 3 +++ tests/ci/split_build_smoke_check.py | 3 +++ tests/ci/stress_check.py | 3 +++ tests/ci/style_check.py | 3 +++ tests/ci/unit_tests_check.py | 3 +++ 11 files changed, 31 insertions(+), 1 deletion(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 592e905bcb5..50315ad3d78 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -244,3 +244,6 @@ if __name__ == "__main__": state=summary_status, target_url=url, ) + + if summary_status == "error": + sys.exit(1) diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index d546fabf231..1f408ba0445 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -198,3 +198,6 @@ if __name__ == "__main__": CHECK_NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 3d0cc468aec..ef6d369bf4f 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -7,6 +7,7 @@ import platform import shutil import subprocess import time +import sys from typing import Dict, List, Optional, Set, Tuple, Union from github import Github @@ -461,6 +462,8 @@ def main(): ch_helper = ClickHouseHelper() ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + if status == "error": + sys.exit(1) if __name__ == "__main__": main() diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 58678b160a4..6e0c7e5d5fb 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -115,3 +115,6 @@ if __name__ == "__main__": NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if status == "error": + sys.exit(1) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 64e04594786..40fa5bff091 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -208,7 +208,7 @@ if __name__ == "__main__": # Refuse other checks to run if fast test failed if state != "success": - if "force-tests" in pr_info.labels: + if "force-tests" in pr_info.labels and state != "error": print("'force-tests' enabled, will report success") else: sys.exit(1) diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 30009414d6e..7481beb2329 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -280,3 +280,6 @@ if __name__ == "__main__": check_name_with_group, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/performance_comparison_check.py b/tests/ci/performance_comparison_check.py index c6ce86b2ce1..2e8a0de7100 100644 --- a/tests/ci/performance_comparison_check.py +++ b/tests/ci/performance_comparison_check.py @@ -217,3 +217,6 @@ if __name__ == "__main__": post_commit_status( gh, pr_info.sha, check_name_with_group, message, status, report_url ) + + if status == 
"error": + sys.exit(1) diff --git a/tests/ci/split_build_smoke_check.py b/tests/ci/split_build_smoke_check.py index 41ba6c2fedb..6629c2b3f71 100644 --- a/tests/ci/split_build_smoke_check.py +++ b/tests/ci/split_build_smoke_check.py @@ -148,3 +148,6 @@ if __name__ == "__main__": CHECK_NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index 32c181140e2..904ac38bffe 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -177,3 +177,6 @@ if __name__ == "__main__": check_name, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 1b3037217c8..811e6d1d746 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -118,3 +118,6 @@ if __name__ == "__main__": NAME, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index 84c4faa822d..f86560562b9 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -174,3 +174,6 @@ if __name__ == "__main__": check_name, ) ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + + if state == "error": + sys.exit(1) From 3246261da8a3152bc0ecf0c8855120454d76877f Mon Sep 17 00:00:00 2001 From: tavplubix Date: Tue, 29 Mar 2022 15:43:42 +0300 Subject: [PATCH 27/58] Fix logging in `test_distributed_respect_user_timeouts` (#35575) * Update test.py * Update test.py * Update test.py Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- .../integration/test_distributed_respect_user_timeouts/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_distributed_respect_user_timeouts/test.py b/tests/integration/test_distributed_respect_user_timeouts/test.py index 9cf7082d63a..567377aba0b 100644 --- a/tests/integration/test_distributed_respect_user_timeouts/test.py +++ b/tests/integration/test_distributed_respect_user_timeouts/test.py @@ -94,7 +94,7 @@ def _check_exception(exception, expected_tries=3): @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def started_cluster(request): - cluster = ClickHouseCluster(__file__) + cluster = ClickHouseCluster(__file__, request.param) cluster.__with_ssl_config = request.param == "configs_secure" main_configs = [] main_configs += [os.path.join(request.param, "config.d/remote_servers.xml")] From 0cc9c8212410e8bad45e5124f7beb6e98e17c8d9 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 29 Mar 2022 13:09:37 +0000 Subject: [PATCH 28/58] Try to fix test_global_overcommit_tracker flakyness --- tests/integration/test_global_overcommit_tracker/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_global_overcommit_tracker/test.py b/tests/integration/test_global_overcommit_tracker/test.py index c2f3a22915f..cacc447be1a 100644 --- a/tests/integration/test_global_overcommit_tracker/test.py +++ b/tests/integration/test_global_overcommit_tracker/test.py @@ -30,7 +30,7 @@ def test_overcommited_is_killed(): responses_A = list() responses_B = list() - for _ in range(100): + for _ in range(500): responses_A.append(node.get_query_request(TEST_QUERY_A, user="A")) responses_B.append(node.get_query_request(TEST_QUERY_B, 
user="B")) From 9990abb76afad4ac7c65895976d777ded0ca9d1c Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 28 Mar 2022 09:48:17 +0000 Subject: [PATCH 29/58] Use compile-time check for Exception messages, fix wrong messages --- src/Common/Exception.h | 22 +++++++++---------- src/Databases/DatabaseReplicatedWorker.cpp | 8 +++++-- src/Dictionaries/XDBCDictionarySource.cpp | 2 +- src/Disks/IO/CachedReadBufferFromRemoteFS.cpp | 9 ++++++-- src/Formats/FormatSchemaInfo.cpp | 9 +++++--- src/Functions/FunctionsStringArray.h | 2 +- src/Functions/IFunction.cpp | 9 +++++--- src/Functions/castOrDefault.cpp | 2 +- src/Functions/dateName.cpp | 2 +- src/Interpreters/CatBoostModel.cpp | 1 + src/Interpreters/DatabaseCatalog.cpp | 8 +++---- src/Interpreters/executeDDLQueryOnCluster.cpp | 19 ++++++++++------ .../Formats/Impl/MsgPackRowInputFormat.cpp | 4 ++-- 13 files changed, 59 insertions(+), 38 deletions(-) diff --git a/src/Common/Exception.h b/src/Common/Exception.h index 0bf89e7a447..c9bb8cc9f12 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -35,10 +35,10 @@ public: {} // Format message with fmt::format, like the logging functions. - template - Exception(int code, const std::string & fmt, Args&&... args) - : Exception(fmt::format(fmt::runtime(fmt), std::forward(args)...), code) - {} + template + Exception(int code, fmt::format_string fmt, Args &&... args) : Exception(fmt::format(fmt, std::forward(args)...), code) + { + } struct CreateFromPocoTag {}; struct CreateFromSTDTag {}; @@ -52,10 +52,10 @@ public: const char * what() const throw() override { return message().data(); } /// Add something to the existing message. - template - void addMessage(const std::string& format, Args&&... args) + template + void addMessage(fmt::format_string format, Args &&... args) { - extendedMessage(fmt::format(fmt::runtime(format), std::forward(args)...)); + extendedMessage(fmt::format(format, std::forward(args)...)); } void addMessage(const std::string& message) @@ -117,10 +117,10 @@ public: ParsingException(int code, const std::string & message); // Format message with fmt::format, like the logging functions. - template - ParsingException(int code, const std::string & fmt, Args&&... args) - : Exception(fmt::format(fmt::runtime(fmt), std::forward(args)...), code) - {} + template + ParsingException(int code, fmt::format_string fmt, Args &&... args) : Exception(code, fmt, std::forward(args)...) 
+ { + } std::string displayText() const diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 365a5d02816..b45cfb16362 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -179,8 +179,12 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr if (!task->was_executed) { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}", - task->execution_status.code, task->execution_status.message); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Entry {} was executed, but was not committed: code {}: {}", + task->entry_name, + task->execution_status.code, + task->execution_status.message); } try_node->setAlreadyRemoved(); diff --git a/src/Dictionaries/XDBCDictionarySource.cpp b/src/Dictionaries/XDBCDictionarySource.cpp index f08abcdc516..764a7072ca0 100644 --- a/src/Dictionaries/XDBCDictionarySource.cpp +++ b/src/Dictionaries/XDBCDictionarySource.cpp @@ -50,7 +50,7 @@ namespace { if (!qualified_name.database.empty()) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Dictionary source of type {} specifies a schema but schema is not supported by {}-driver", + "Dictionary source specifies a schema but schema is not supported by {}-driver", bridge_.getName()); } diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index 5cab2cb2995..4b614cd10e0 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -392,8 +392,13 @@ void CachedReadBufferFromRemoteFS::predownload(FileSegmentPtr & file_segment) if (bytes_to_predownload) throw Exception( ErrorCodes::LOGICAL_ERROR, - "Failed to predownload remaining {} bytes. Current file segment: {}, current download offset: {}, expected: {}, eof: {}", - file_segment->range().toString(), file_segment->getDownloadOffset(), file_offset_of_buffer_end, implementation_buffer->eof()); + "Failed to predownload remaining {} bytes. 
Current file segment: {}, current download offset: {}, expected: {}, " + "eof: {}", + bytes_to_predownload, + file_segment->range().toString(), + file_segment->getDownloadOffset(), + file_offset_of_buffer_end, + implementation_buffer->eof()); auto result = implementation_buffer->hasPendingData(); diff --git a/src/Formats/FormatSchemaInfo.cpp b/src/Formats/FormatSchemaInfo.cpp index 24c8dfc14f2..c1157be6e7a 100644 --- a/src/Formats/FormatSchemaInfo.cpp +++ b/src/Formats/FormatSchemaInfo.cpp @@ -85,9 +85,12 @@ FormatSchemaInfo::FormatSchemaInfo(const String & format_schema, const String & else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string())) { if (is_server) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", - path.string()); + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})", + default_schema_directory(), + path.string(), + default_schema_directory()); path = default_schema_directory_path / path; schema_path = path.filename(); schema_directory = path.parent_path() / ""; diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h index a1256598f1b..6b3adf46ff5 100644 --- a/src/Functions/FunctionsStringArray.h +++ b/src/Functions/FunctionsStringArray.h @@ -259,7 +259,7 @@ public: throw Exception( ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.", - arguments.size()); + name, arguments.size()); if (!isString(arguments[0])) throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.", diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index cfb4e12a025..19638c78daf 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -181,9 +181,12 @@ ColumnPtr IExecutableFunction::defaultImplementationForNulls( // Default implementation for nulls returns null result for null arguments, // so the result type must be nullable. if (!result_type->isNullable()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Function {} with Null argument and default implementation for Nulls " - "is expected to return Nullable result, got {}", result_type->getName()); + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Function {} with Null argument and default implementation for Nulls " + "is expected to return Nullable result, got {}", + getName(), + result_type->getName()); return result_type->createColumnConstWithDefaultValue(input_rows_count); } diff --git a/src/Functions/castOrDefault.cpp b/src/Functions/castOrDefault.cpp index 628ac57f34d..f7b93ec2e83 100644 --- a/src/Functions/castOrDefault.cpp +++ b/src/Functions/castOrDefault.cpp @@ -231,7 +231,7 @@ private: { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Function {} decimal scale should have native UInt type. 
Actual {}", - scale_argument.type->getName()); + getName(), scale_argument.type->getName()); } scale = arguments[additional_argument_index].column->getUInt(0); diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp index eef9bc3955b..91ea8995777 100644 --- a/src/Functions/dateName.cpp +++ b/src/Functions/dateName.cpp @@ -112,7 +112,7 @@ public: || (res = executeType(arguments, result_type)))) throw Exception( ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of function {], must be Date or DateTime.", + "Illegal column {} of function {}, must be Date or DateTime.", arguments[1].column->getName(), getName()); diff --git a/src/Interpreters/CatBoostModel.cpp b/src/Interpreters/CatBoostModel.cpp index cffaa81c4f0..d5803ed9e36 100644 --- a/src/Interpreters/CatBoostModel.cpp +++ b/src/Interpreters/CatBoostModel.cpp @@ -169,6 +169,7 @@ public: if (columns.size() != float_features_count + cat_features_count) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of columns is different with number of features: columns size {} float features size {} + cat features size {}", + columns.size(), float_features_count, cat_features_count); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 360a5d430e0..2f51d942403 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -233,7 +233,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( { assert(!db_and_table.first && !db_and_table.second); if (exception) - exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs()); + exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE); return {}; } @@ -263,7 +263,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( /// If table_id has no UUID, then the name of database was specified by user and table_id was not resolved through context. /// Do not allow access to TEMPORARY_DATABASE because it contains all temporary tables of all contexts and users. 
if (exception) - exception->emplace(ErrorCodes::DATABASE_ACCESS_DENIED, "Direct access to `{}` database is not allowed", String(TEMPORARY_DATABASE)); + exception->emplace(fmt::format("Direct access to `{}` database is not allowed", TEMPORARY_DATABASE), ErrorCodes::DATABASE_ACCESS_DENIED); return {}; } @@ -274,7 +274,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( if (databases.end() == it) { if (exception) - exception->emplace(ErrorCodes::UNKNOWN_DATABASE, "Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())); + exception->emplace(fmt::format("Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())), ErrorCodes::UNKNOWN_DATABASE); return {}; } database = it->second; @@ -282,7 +282,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( auto table = database->tryGetTable(table_id.table_name, context_); if (!table && exception) - exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs()); + exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE); if (!table) database = nullptr; diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index ce00676b2ed..f0279bafca2 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -320,12 +320,13 @@ Chunk DDLQueryStatusSource::generate() if (throw_on_timeout) { if (!first_exception) - first_exception = std::make_unique(ErrorCodes::TIMEOUT_EXCEEDED, msg_format, - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + first_exception = std::make_unique( + fmt::format(msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts), + ErrorCodes::TIMEOUT_EXCEEDED); return {}; } - LOG_INFO(log, fmt::runtime(msg_format), node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + LOG_INFO(log, msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); NameSet unfinished_hosts = waiting_hosts; for (const auto & host_id : finished_hosts) @@ -358,9 +359,12 @@ Chunk DDLQueryStatusSource::generate() /// Paradoxically, this exception will be throw even in case of "never_throw" mode. if (!first_exception) - first_exception = std::make_unique(ErrorCodes::UNFINISHED, - "Cannot provide query execution status. The query's node {} has been deleted by the cleaner" - " since it was finished (or its lifetime is expired)", node_path); + first_exception = std::make_unique( + fmt::format( + "Cannot provide query execution status. 
The query's node {} has been deleted by the cleaner" + " since it was finished (or its lifetime is expired)", + node_path), + ErrorCodes::UNFINISHED); return {}; } @@ -386,7 +390,8 @@ Chunk DDLQueryStatusSource::generate() if (status.code != 0 && !first_exception && context->getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW) { - first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); + first_exception = std::make_unique( + fmt::format("There was an error on [{}:{}]: {}", host, port, status.message), status.code); } ++num_hosts_finished; diff --git a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp index 607e6f36767..722cedbab30 100644 --- a/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/MsgPackRowInputFormat.cpp @@ -359,7 +359,7 @@ bool MsgPackVisitor::visit_ext(const char * value, uint32_t size) return true; } - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {%x}", type); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unsupported MsgPack extension type: {:x}", type); } void MsgPackVisitor::parse_error(size_t, size_t) // NOLINT @@ -498,7 +498,7 @@ DataTypePtr MsgPackSchemaReader::getDataType(const msgpack::object & object) msgpack::object_ext object_ext = object.via.ext; if (object_ext.type() == int8_t(MsgPackExtensionTypes::UUIDType)) return std::make_shared(); - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {%x} is not supported", object_ext.type()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Msgpack extension type {:x} is not supported", object_ext.type()); } } __builtin_unreachable(); From ef484547bf9927c222c588113487a1534e61e39a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 10:14:38 +0000 Subject: [PATCH 30/58] Fix GCC build --- src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp index a72d7b12a4f..ebfeaed8346 100644 --- a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp +++ b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp @@ -1,10 +1,13 @@ +// Needs to go first because its partial specialization of fmt::formatter +// should be defined before any instantiation +#include + #include #include #include #include -#include #include namespace DB From c9feb7f7515cf2711877fa40844a1b55b4e03267 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 29 Mar 2022 13:40:14 +0000 Subject: [PATCH 31/58] Format test file --- tests/integration/test_allowed_url_from_config/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From d0f01516db0c2d403f9d45898251bd417e78ad54 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 15:48:57 +0200 Subject: [PATCH 32/58] Resurrect automatic 
labelling --- .github/workflows/master.yml | 10 - tests/ci/cherry_pick_utils/parser.py | 8 - tests/ci/commit_status_helper.py | 5 + tests/ci/run_check.py | 34 +- utils/github/__init__.py | 1 - utils/github/backport.py | 185 ---------- utils/github/cherrypick.py | 323 ------------------ utils/github/local.py | 108 ------ utils/github/parser.py | 64 ---- utils/github/query.py | 492 --------------------------- 10 files changed, 31 insertions(+), 1199 deletions(-) delete mode 100644 utils/github/__init__.py delete mode 100644 utils/github/backport.py delete mode 100644 utils/github/cherrypick.py delete mode 100644 utils/github/local.py delete mode 100644 utils/github/parser.py delete mode 100644 utils/github/query.py diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 73cffc1d5d3..cfa95b84ee5 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -149,7 +149,6 @@ jobs: sudo rm -fr "$TEMP_PATH" SplitBuildSmokeTest: needs: [BuilderDebSplitted] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, style-checker] steps: - name: Set envs @@ -316,7 +315,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinRelease: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -362,7 +360,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinGCC: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -636,7 +633,6 @@ jobs: ########################################################################################## BuilderDebSplitted: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -682,7 +678,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinTidy: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -728,7 +723,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinDarwin: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -774,7 +768,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinAarch64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -820,7 +813,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinFreeBSD: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -866,7 +858,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" 
BuilderBinDarwinAarch64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs @@ -912,7 +903,6 @@ jobs: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" BuilderBinPPC64: needs: [DockerHubPush] - if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }} runs-on: [self-hosted, builder] steps: - name: Set envs diff --git a/tests/ci/cherry_pick_utils/parser.py b/tests/ci/cherry_pick_utils/parser.py index d8348e6d964..29c05e5328f 100644 --- a/tests/ci/cherry_pick_utils/parser.py +++ b/tests/ci/cherry_pick_utils/parser.py @@ -20,8 +20,6 @@ class Description: def __init__(self, pull_request): self.label_name = str() - self.legal = False - self._parse(pull_request["bodyText"]) def _parse(self, text): @@ -39,12 +37,6 @@ class Description: category = stripped next_category = False - if ( - stripped - == "I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en" - ): - self.legal = True - category_headers = ( "Category (leave one):", "Changelog category (leave one):", diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index e379c9a2254..53dfb145b44 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -59,3 +59,8 @@ def post_commit_status_to_file(file_path, description, state, report_url): with open(file_path, "w", encoding="utf-8") as f: out = csv.writer(f, delimiter="\t") out.writerow([state, report_url, description]) + +def post_label(gh, pr_info, label_name): + repo = gh.get_repo(GITHUB_REPOSITORY) + pull_request = repo.get_pull(pr_info.number) + pull_request.add_to_labels(label_name) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 9c7ba13f8e4..74e60582e8d 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit +from commit_status_helper import get_commit, post_label NAME = "Run Check (actions)" @@ -81,6 +81,21 @@ TRUSTED_CONTRIBUTORS = { ] } +MAP_CATEGORY_TO_LABEL = { + "New Feature": "pr-feature", + "Bug Fix": "pr-bugfix", + "Bug Fix (user-visible misbehaviour in official stable or prestable release)": "pr-bugfix", + "Improvement": "pr-improvement", + "Performance Improvement": "pr-performance", + "Backward Incompatible Change": "pr-backward-incompatible", + "Build/Testing/Packaging Improvement": "pr-build", + "Not for changelog (changelog entry is not required)": "pr-not-for-changelog", + "Not for changelog": "pr-not-for-changelog", + "Documentation (changelog entry is not required)": "pr-documentation", + "Documentation": "pr-documentation", + # 'Other': doesn't match anything +} + def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): if pr_user_login.lower() in TRUSTED_CONTRIBUTORS: @@ -168,7 +183,7 @@ def check_pr_description(pr_info): + second_category + "'" ) - return result_status[:140] + return result_status[:140], category elif re.match( r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] @@ -190,19 +205,19 @@ def check_pr_description(pr_info): i += 1 if not category: - return "Changelog category is empty" + return "Changelog category is empty", category # 
Filter out the PR categories that are not for changelog. if re.match( r"(?i)doc|((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", category, ): - return "" + return "", category if not entry: - return f"Changelog entry required for category '{category}'" + return f"Changelog entry required for category '{category}'", category - return "" + return "", category if __name__ == "__main__": @@ -213,7 +228,10 @@ if __name__ == "__main__": gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) - description_report = check_pr_description(pr_info)[:139] + description_report, category = check_pr_description(pr_info) + if category in MAP_CATEGORY_TO_LABEL: + post_label(gh, pr_info, MAP_CATEGORY_TO_LABEL[category]) + if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( @@ -225,7 +243,7 @@ if __name__ == "__main__": ) commit.create_status( context=NAME, - description=description_report, + description=description_report[:139], state="failure", target_url=url, ) diff --git a/utils/github/__init__.py b/utils/github/__init__.py deleted file mode 100644 index 40a96afc6ff..00000000000 --- a/utils/github/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# -*- coding: utf-8 -*- diff --git a/utils/github/backport.py b/utils/github/backport.py deleted file mode 100644 index 615c0d19ffa..00000000000 --- a/utils/github/backport.py +++ /dev/null @@ -1,185 +0,0 @@ -# -*- coding: utf-8 -*- - -try: - from clickhouse.utils.github.cherrypick import CherryPick - from clickhouse.utils.github.query import Query as RemoteRepo - from clickhouse.utils.github.local import Repository as LocalRepo -except: - from .cherrypick import CherryPick - from .query import Query as RemoteRepo - from .local import Repository as LocalRepo - -import argparse -import logging -import re -import sys - - -class Backport: - def __init__(self, token, owner, name, team): - self._gh = RemoteRepo( - token, owner=owner, name=name, team=team, max_page_size=30, min_page_size=7 - ) - self._token = token - self.default_branch_name = self._gh.default_branch - self.ssh_url = self._gh.ssh_url - - def getPullRequests(self, from_commit): - return self._gh.get_pull_requests(from_commit) - - def getBranchesWithRelease(self): - branches = set() - for pull_request in self._gh.find_pull_requests("release"): - branches.add(pull_request["headRefName"]) - return branches - - def execute(self, repo, upstream, until_commit, run_cherrypick): - repo = LocalRepo(repo, upstream, self.default_branch_name) - all_branches = repo.get_release_branches() # [(branch_name, base_commit)] - - release_branches = self.getBranchesWithRelease() - - branches = [] - # iterate over all branches to preserve their precedence. - for branch in all_branches: - if branch[0] in release_branches: - branches.append(branch) - - if not branches: - logging.info("No release branches found!") - return - - for branch in branches: - logging.info("Found release branch: %s", branch[0]) - - if not until_commit: - until_commit = branches[0][1] - pull_requests = self.getPullRequests(until_commit) - - backport_map = {} - - RE_MUST_BACKPORT = re.compile(r"^v(\d+\.\d+)-must-backport$") - RE_NO_BACKPORT = re.compile(r"^v(\d+\.\d+)-no-backport$") - RE_BACKPORTED = re.compile(r"^v(\d+\.\d+)-backported$") - - # pull-requests are sorted by ancestry from the most recent. - for pr in pull_requests: - while repo.comparator(branches[-1][1]) >= repo.comparator( - pr["mergeCommit"]["oid"] - ): - logging.info( - "PR #{} is already inside {}. 
Dropping this branch for further PRs".format( - pr["number"], branches[-1][0] - ) - ) - branches.pop() - - logging.info("Processing PR #{}".format(pr["number"])) - - assert len(branches) - - branch_set = set([branch[0] for branch in branches]) - - # First pass. Find all must-backports - for label in pr["labels"]["nodes"]: - if label["name"] == "pr-must-backport": - backport_map[pr["number"]] = branch_set.copy() - continue - matched = RE_MUST_BACKPORT.match(label["name"]) - if matched: - if pr["number"] not in backport_map: - backport_map[pr["number"]] = set() - backport_map[pr["number"]].add(matched.group(1)) - - # Second pass. Find all no-backports - for label in pr["labels"]["nodes"]: - if label["name"] == "pr-no-backport" and pr["number"] in backport_map: - del backport_map[pr["number"]] - break - matched_no_backport = RE_NO_BACKPORT.match(label["name"]) - matched_backported = RE_BACKPORTED.match(label["name"]) - if ( - matched_no_backport - and pr["number"] in backport_map - and matched_no_backport.group(1) in backport_map[pr["number"]] - ): - backport_map[pr["number"]].remove(matched_no_backport.group(1)) - logging.info( - "\tskipping %s because of forced no-backport", - matched_no_backport.group(1), - ) - elif ( - matched_backported - and pr["number"] in backport_map - and matched_backported.group(1) in backport_map[pr["number"]] - ): - backport_map[pr["number"]].remove(matched_backported.group(1)) - logging.info( - "\tskipping %s because it's already backported manually", - matched_backported.group(1), - ) - - for pr, branches in list(backport_map.items()): - logging.info("PR #%s needs to be backported to:", pr) - for branch in branches: - logging.info( - "\t%s, and the status is: %s", - branch, - run_cherrypick(self._token, pr, branch), - ) - - # print API costs - logging.info("\nGitHub API total costs per query:") - for name, value in list(self._gh.api_costs.items()): - logging.info("%s : %s", name, value) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", type=str, required=True, help="token for Github access" - ) - parser.add_argument( - "--repo", - type=str, - required=True, - help="path to full repository", - metavar="PATH", - ) - parser.add_argument( - "--til", type=str, help="check PRs from HEAD til this commit", metavar="COMMIT" - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="do not create or merge any PRs", - default=False, - ) - parser.add_argument( - "--verbose", - "-v", - action="store_true", - help="more verbose output", - default=False, - ) - parser.add_argument( - "--upstream", - "-u", - type=str, - help="remote name of upstream in repository", - default="origin", - ) - args = parser.parse_args() - - if args.verbose: - logging.basicConfig( - format="%(message)s", stream=sys.stdout, level=logging.DEBUG - ) - else: - logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.INFO) - - cherrypick_run = lambda token, pr, branch: CherryPick( - token, "ClickHouse", "ClickHouse", "core", pr, branch - ).execute(args.repo, args.dry_run) - bp = Backport(args.token, "ClickHouse", "ClickHouse", "core") - bp.execute(args.repo, args.upstream, args.til, cherrypick_run) diff --git a/utils/github/cherrypick.py b/utils/github/cherrypick.py deleted file mode 100644 index c6469fa62a9..00000000000 --- a/utils/github/cherrypick.py +++ /dev/null @@ -1,323 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Backports changes from PR to release branch. 
-Requires multiple separate runs as part of the implementation. - -First run should do the following: -1. Merge release branch with a first parent of merge-commit of PR (using 'ours' strategy). (branch: backport/{branch}/{pr}) -2. Create temporary branch over merge-commit to use it for PR creation. (branch: cherrypick/{merge_commit}) -3. Create PR from temporary branch to backport branch (emulating cherry-pick). - -Second run checks PR from previous run to be merged or at least being mergeable. If it's not merged then try to merge it. - -Third run creates PR from backport branch (with merged previous PR) to release branch. -""" - -try: - from clickhouse.utils.github.query import Query as RemoteRepo -except: - from .query import Query as RemoteRepo - -import argparse -from enum import Enum -import logging -import os -import subprocess -import sys - - -class CherryPick: - class Status(Enum): - DISCARDED = "discarded" - NOT_INITIATED = "not started" - FIRST_MERGEABLE = "waiting for 1st stage" - FIRST_CONFLICTS = "conflicts on 1st stage" - SECOND_MERGEABLE = "waiting for 2nd stage" - SECOND_CONFLICTS = "conflicts on 2nd stage" - MERGED = "backported" - - def _run(self, args): - out = subprocess.check_output(args).rstrip() - logging.debug(out) - return out - - def __init__(self, token, owner, name, team, pr_number, target_branch): - self._gh = RemoteRepo(token, owner=owner, name=name, team=team) - self._pr = self._gh.get_pull_request(pr_number) - - self.ssh_url = self._gh.ssh_url - - # TODO: check if pull-request is merged. - - self.merge_commit_oid = self._pr["mergeCommit"]["oid"] - - self.target_branch = target_branch - self.backport_branch = "backport/{branch}/{pr}".format( - branch=target_branch, pr=pr_number - ) - self.cherrypick_branch = "cherrypick/{branch}/{oid}".format( - branch=target_branch, oid=self.merge_commit_oid - ) - - def getCherryPickPullRequest(self): - return self._gh.find_pull_request( - base=self.backport_branch, head=self.cherrypick_branch - ) - - def createCherryPickPullRequest(self, repo_path): - DESCRIPTION = ( - "This pull-request is a first step of an automated backporting.\n" - "It contains changes like after calling a local command `git cherry-pick`.\n" - "If you intend to continue backporting this changes, then resolve all conflicts if any.\n" - "Otherwise, if you do not want to backport them, then just close this pull-request.\n" - "\n" - "The check results does not matter at this step - you can safely ignore them.\n" - "Also this pull-request will be merged automatically as it reaches the mergeable state, but you always can merge it manually.\n" - ) - - # FIXME: replace with something better than os.system() - git_prefix = [ - "git", - "-C", - repo_path, - "-c", - "user.email=robot-clickhouse@yandex-team.ru", - "-c", - "user.name=robot-clickhouse", - ] - base_commit_oid = self._pr["mergeCommit"]["parents"]["nodes"][0]["oid"] - - # Create separate branch for backporting, and make it look like real cherry-pick. - self._run(git_prefix + ["checkout", "-f", self.target_branch]) - self._run(git_prefix + ["checkout", "-B", self.backport_branch]) - self._run(git_prefix + ["merge", "-s", "ours", "--no-edit", base_commit_oid]) - - # Create secondary branch to allow pull request with cherry-picked commit. 
- self._run( - git_prefix + ["branch", "-f", self.cherrypick_branch, self.merge_commit_oid] - ) - - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.backport_branch), - ] - ) - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.cherrypick_branch), - ] - ) - - # Create pull-request like a local cherry-pick - pr = self._gh.create_pull_request( - source=self.cherrypick_branch, - target=self.backport_branch, - title="Cherry pick #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ), - description="Original pull-request #{}\n\n{}".format( - self._pr["number"], DESCRIPTION - ), - ) - - # FIXME: use `team` to leave a single eligible assignee. - self._gh.add_assignee(pr, self._pr["author"]) - self._gh.add_assignee(pr, self._pr["mergedBy"]) - - self._gh.set_label(pr, "do not test") - self._gh.set_label(pr, "pr-cherrypick") - - return pr - - def mergeCherryPickPullRequest(self, cherrypick_pr): - return self._gh.merge_pull_request(cherrypick_pr["id"]) - - def getBackportPullRequest(self): - return self._gh.find_pull_request( - base=self.target_branch, head=self.backport_branch - ) - - def createBackportPullRequest(self, cherrypick_pr, repo_path): - DESCRIPTION = ( - "This pull-request is a last step of an automated backporting.\n" - "Treat it as a standard pull-request: look at the checks and resolve conflicts.\n" - "Merge it only if you intend to backport changes to the target branch, otherwise just close it.\n" - ) - - git_prefix = [ - "git", - "-C", - repo_path, - "-c", - "user.email=robot-clickhouse@clickhouse.com", - "-c", - "user.name=robot-clickhouse", - ] - - pr_title = "Backport #{number} to {target}: {title}".format( - number=self._pr["number"], - target=self.target_branch, - title=self._pr["title"].replace('"', '\\"'), - ) - - self._run(git_prefix + ["checkout", "-f", self.backport_branch]) - self._run(git_prefix + ["pull", "--ff-only", "origin", self.backport_branch]) - self._run( - git_prefix - + [ - "reset", - "--soft", - self._run( - git_prefix - + [ - "merge-base", - "origin/" + self.target_branch, - self.backport_branch, - ] - ), - ] - ) - self._run(git_prefix + ["commit", "-a", "--allow-empty", "-m", pr_title]) - self._run( - git_prefix - + [ - "push", - "-f", - "origin", - "{branch}:{branch}".format(branch=self.backport_branch), - ] - ) - - pr = self._gh.create_pull_request( - source=self.backport_branch, - target=self.target_branch, - title=pr_title, - description="Original pull-request #{}\nCherry-pick pull-request #{}\n\n{}".format( - self._pr["number"], cherrypick_pr["number"], DESCRIPTION - ), - ) - - # FIXME: use `team` to leave a single eligible assignee. 
- self._gh.add_assignee(pr, self._pr["author"]) - self._gh.add_assignee(pr, self._pr["mergedBy"]) - - self._gh.set_label(pr, "pr-backport") - - return pr - - def execute(self, repo_path, dry_run=False): - pr1 = self.getCherryPickPullRequest() - if not pr1: - if not dry_run: - pr1 = self.createCherryPickPullRequest(repo_path) - logging.debug( - "Created PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - else: - return CherryPick.Status.NOT_INITIATED - else: - logging.debug( - "Found PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if not pr1["merged"] and pr1["mergeable"] == "MERGEABLE" and not pr1["closed"]: - if not dry_run: - pr1 = self.mergeCherryPickPullRequest(pr1) - logging.debug( - "Merged PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if not pr1["merged"]: - logging.debug( - "Waiting for PR with cherry-pick of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr1["url"], - ) - - if pr1["closed"]: - return CherryPick.Status.DISCARDED - elif pr1["mergeable"] == "CONFLICTING": - return CherryPick.Status.FIRST_CONFLICTS - else: - return CherryPick.Status.FIRST_MERGEABLE - - pr2 = self.getBackportPullRequest() - if not pr2: - if not dry_run: - pr2 = self.createBackportPullRequest(pr1, repo_path) - logging.debug( - "Created PR with backport of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr2["url"], - ) - else: - return CherryPick.Status.FIRST_MERGEABLE - else: - logging.debug( - "Found PR with backport of %s to %s: %s", - self._pr["number"], - self.target_branch, - pr2["url"], - ) - - if pr2["merged"]: - return CherryPick.Status.MERGED - elif pr2["closed"]: - return CherryPick.Status.DISCARDED - elif pr2["mergeable"] == "CONFLICTING": - return CherryPick.Status.SECOND_CONFLICTS - else: - return CherryPick.Status.SECOND_MERGEABLE - - -if __name__ == "__main__": - logging.basicConfig(format="%(message)s", stream=sys.stdout, level=logging.DEBUG) - - parser = argparse.ArgumentParser() - parser.add_argument( - "--token", "-t", type=str, required=True, help="token for Github access" - ) - parser.add_argument("--pr", type=str, required=True, help="PR# to cherry-pick") - parser.add_argument( - "--branch", - "-b", - type=str, - required=True, - help="target branch name for cherry-pick", - ) - parser.add_argument( - "--repo", - "-r", - type=str, - required=True, - help="path to full repository", - metavar="PATH", - ) - args = parser.parse_args() - - cp = CherryPick( - args.token, "ClickHouse", "ClickHouse", "core", args.pr, args.branch - ) - cp.execute(args.repo) diff --git a/utils/github/local.py b/utils/github/local.py deleted file mode 100644 index 571c9102ba0..00000000000 --- a/utils/github/local.py +++ /dev/null @@ -1,108 +0,0 @@ -# -*- coding: utf-8 -*- - -import functools -import logging -import os -import re - - -class RepositoryBase: - def __init__(self, repo_path): - import git - - self._repo = git.Repo(repo_path, search_parent_directories=(not repo_path)) - - # comparator of commits - def cmp(x, y): - if str(x) == str(y): - return 0 - if self._repo.is_ancestor(x, y): - return -1 - else: - return 1 - - self.comparator = functools.cmp_to_key(cmp) - - def get_head_commit(self): - return self._repo.commit(self._default) - - def iterate(self, begin, end): - rev_range = "{}...{}".format(begin, end) - for commit in self._repo.iter_commits(rev_range, first_parent=True): - yield commit - - -class 
Repository(RepositoryBase): - def __init__(self, repo_path, remote_name, default_branch_name): - super(Repository, self).__init__(repo_path) - self._remote = self._repo.remotes[remote_name] - self._remote.fetch() - self._default = self._remote.refs[default_branch_name] - - def get_release_branches(self): - """ - Returns sorted list of tuples: - * remote branch (git.refs.remote.RemoteReference), - * base commit (git.Commit), - * head (git.Commit)). - List is sorted by commits in ascending order. - """ - release_branches = [] - - RE_RELEASE_BRANCH_REF = re.compile(r"^refs/remotes/.+/\d+\.\d+$") - - for branch in [ - r for r in self._remote.refs if RE_RELEASE_BRANCH_REF.match(r.path) - ]: - base = self._repo.merge_base(self._default, self._repo.commit(branch)) - if not base: - logging.info( - "Branch %s is not based on branch %s. Ignoring.", - branch.path, - self._default, - ) - elif len(base) > 1: - logging.info( - "Branch %s has more than one base commit. Ignoring.", branch.path - ) - else: - release_branches.append((os.path.basename(branch.name), base[0])) - - return sorted(release_branches, key=lambda x: self.comparator(x[1])) - - -class BareRepository(RepositoryBase): - def __init__(self, repo_path, default_branch_name): - super(BareRepository, self).__init__(repo_path) - self._default = self._repo.branches[default_branch_name] - - def get_release_branches(self): - """ - Returns sorted list of tuples: - * branch (git.refs.head?), - * base commit (git.Commit), - * head (git.Commit)). - List is sorted by commits in ascending order. - """ - release_branches = [] - - RE_RELEASE_BRANCH_REF = re.compile(r"^refs/heads/\d+\.\d+$") - - for branch in [ - r for r in self._repo.branches if RE_RELEASE_BRANCH_REF.match(r.path) - ]: - base = self._repo.merge_base(self._default, self._repo.commit(branch)) - if not base: - logging.info( - "Branch %s is not based on branch %s. Ignoring.", - branch.path, - self._default, - ) - elif len(base) > 1: - logging.info( - "Branch %s has more than one base commit. 
Ignoring.", branch.path - ) - else: - release_branches.append((os.path.basename(branch.name), base[0])) - - return sorted(release_branches, key=lambda x: self.comparator(x[1])) diff --git a/utils/github/parser.py b/utils/github/parser.py deleted file mode 100644 index d8348e6d964..00000000000 --- a/utils/github/parser.py +++ /dev/null @@ -1,64 +0,0 @@ -# -*- coding: utf-8 -*- - - -class Description: - """Parsed description representation""" - - MAP_CATEGORY_TO_LABEL = { - "New Feature": "pr-feature", - "Bug Fix": "pr-bugfix", - "Improvement": "pr-improvement", - "Performance Improvement": "pr-performance", - # 'Backward Incompatible Change': doesn't match anything - "Build/Testing/Packaging Improvement": "pr-build", - "Non-significant (changelog entry is not needed)": "pr-non-significant", - "Non-significant (changelog entry is not required)": "pr-non-significant", - "Non-significant": "pr-non-significant", - "Documentation (changelog entry is not required)": "pr-documentation", - # 'Other': doesn't match anything - } - - def __init__(self, pull_request): - self.label_name = str() - self.legal = False - - self._parse(pull_request["bodyText"]) - - def _parse(self, text): - lines = text.splitlines() - next_category = False - category = str() - - for line in lines: - stripped = line.strip() - - if not stripped: - continue - - if next_category: - category = stripped - next_category = False - - if ( - stripped - == "I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en" - ): - self.legal = True - - category_headers = ( - "Category (leave one):", - "Changelog category (leave one):", - "Changelog category:", - "Category:", - ) - - if stripped in category_headers: - next_category = True - - if category in Description.MAP_CATEGORY_TO_LABEL: - self.label_name = Description.MAP_CATEGORY_TO_LABEL[category] - else: - if not category: - print("Cannot find category in pr description") - else: - print(("Unknown category: " + category)) diff --git a/utils/github/query.py b/utils/github/query.py deleted file mode 100644 index 7afbc57781c..00000000000 --- a/utils/github/query.py +++ /dev/null @@ -1,492 +0,0 @@ -# -*- coding: utf-8 -*- - -import requests - - -class Query: - """ - Implements queries to the Github API using GraphQL - """ - - _PULL_REQUEST = """ - author {{ - ... on User {{ - id - login - }} - }} - - baseRepository {{ - nameWithOwner - }} - - mergeCommit {{ - oid - parents(first: {min_page_size}) {{ - totalCount - nodes {{ - oid - }} - }} - }} - - mergedBy {{ - ... 
on User {{ - id - login - }} - }} - - baseRefName - closed - headRefName - id - mergeable - merged - number - title - url - """ - - def __init__(self, token, owner, name, team, max_page_size=100, min_page_size=10): - self._PULL_REQUEST = Query._PULL_REQUEST.format(min_page_size=min_page_size) - - self._token = token - self._owner = owner - self._name = name - self._team = team - - self._max_page_size = max_page_size - self._min_page_size = min_page_size - - self.api_costs = {} - - repo = self.get_repository() - self._id = repo["id"] - self.ssh_url = repo["sshUrl"] - self.default_branch = repo["defaultBranchRef"]["name"] - - self.members = set(self.get_members()) - - def get_repository(self): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - name - }} - id - sshUrl - }} - """ - - query = _QUERY.format(owner=self._owner, name=self._name) - return self._run(query)["repository"] - - def get_members(self): - """Get all team members for organization - - Returns: - members: a map of members' logins to ids - """ - - _QUERY = """ - organization(login: "{organization}") {{ - team(slug: "{team}") {{ - members(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - id - login - }} - }} - }} - }} - """ - - members = {} - not_end = True - query = _QUERY.format( - organization=self._owner, - team=self._team, - max_page_size=self._max_page_size, - next="", - ) - - while not_end: - result = self._run(query)["organization"]["team"] - if result is None: - break - result = result["members"] - not_end = result["pageInfo"]["hasNextPage"] - query = _QUERY.format( - organization=self._owner, - team=self._team, - max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - members += dict([(node["login"], node["id"]) for node in result["nodes"]]) - - return members - - def get_pull_request(self, number): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequest(number: {number}) {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - number=number, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - return self._run(query)["repository"]["pullRequest"] - - def find_pull_request(self, base, head): - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} baseRefName: "{base}" headRefName: "{head}") {{ - nodes {{ - {pull_request_data} - }} - totalCount - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - base=base, - head=head, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - result = self._run(query)["repository"]["pullRequests"] - if result["totalCount"] > 0: - return result["nodes"][0] - else: - return {} - - def find_pull_requests(self, label_name): - """ - Get all pull-requests filtered by label name - """ - _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - pullRequests(first: {min_page_size} labels: "{label_name}" states: OPEN) {{ - nodes {{ - {pull_request_data} - }} - }} - }} - """ - - query = _QUERY.format( - owner=self._owner, - name=self._name, - label_name=label_name, - pull_request_data=self._PULL_REQUEST, - min_page_size=self._min_page_size, - ) - return self._run(query)["repository"]["pullRequests"]["nodes"] - - def get_pull_requests(self, before_commit): - """ - Get all merged pull-requests from the HEAD of default branch to the last commit (excluding) - """ - 
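A note on the helper being removed above: the Query class walks GitHub's GraphQL API with cursor-based pagination. Each query asks for pageInfo { hasNextPage endCursor } and is re-issued with an after: "<endCursor>" argument until hasNextPage is false; get_members above and get_pull_requests / set_label below all follow this shape. Below is a minimal, self-contained sketch of that pagination loop, assuming the standard GitHub GraphQL endpoint; the run_graphql and iter_team_members names and the exact query text are illustrative, not part of the original file:

    import requests

    GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

    def run_graphql(token, query):
        # One GraphQL round-trip; fails loudly on HTTP or GraphQL-level errors.
        response = requests.post(
            GITHUB_GRAPHQL_URL,
            json={"query": query},
            headers={"Authorization": "bearer {}".format(token)},
        )
        response.raise_for_status()
        payload = response.json()
        if "errors" in payload:
            raise Exception("GraphQL errors: {}".format(payload["errors"]))
        return payload["data"]

    def iter_team_members(token, organization, team, page_size=100):
        # Cursor pagination: keep requesting pages while hasNextPage is true,
        # passing the previous page's endCursor back through the `after:` argument.
        cursor_arg = ""
        while True:
            query = """
            query {{
              organization(login: "{org}") {{
                team(slug: "{team}") {{
                  members(first: {size} {after}) {{
                    pageInfo {{ hasNextPage endCursor }}
                    nodes {{ id login }}
                  }}
                }}
              }}
            }}
            """.format(org=organization, team=team, size=page_size, after=cursor_arg)
            members = run_graphql(token, query)["organization"]["team"]["members"]
            for node in members["nodes"]:
                yield node["login"], node["id"]
            if not members["pageInfo"]["hasNextPage"]:
                break
            cursor_arg = 'after: "{}"'.format(members["pageInfo"]["endCursor"])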
- _QUERY = """ - repository(owner: "{owner}" name: "{name}") {{ - defaultBranchRef {{ - target {{ - ... on Commit {{ - history(first: {max_page_size} {next}) {{ - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - oid - associatedPullRequests(first: {min_page_size}) {{ - totalCount - nodes {{ - ... on PullRequest {{ - {pull_request_data} - - labels(first: {min_page_size}) {{ - totalCount - pageInfo {{ - hasNextPage - endCursor - }} - nodes {{ - name - color - }} - }} - }} - }} - }} - }} - }} - }} - }} - }} - }} - """ - - pull_requests = [] - not_end = True - query = _QUERY.format( - owner=self._owner, - name=self._name, - max_page_size=self._max_page_size, - min_page_size=self._min_page_size, - pull_request_data=self._PULL_REQUEST, - next="", - ) - - while not_end: - result = self._run(query)["repository"]["defaultBranchRef"]["target"][ - "history" - ] - not_end = result["pageInfo"]["hasNextPage"] - query = _QUERY.format( - owner=self._owner, - name=self._name, - max_page_size=self._max_page_size, - min_page_size=self._min_page_size, - pull_request_data=self._PULL_REQUEST, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - for commit in result["nodes"]: - # FIXME: maybe include `before_commit`? - if str(commit["oid"]) == str(before_commit): - not_end = False - break - - # TODO: fetch all pull-requests that were merged in a single commit. - assert ( - commit["associatedPullRequests"]["totalCount"] - <= self._min_page_size - ) - - for pull_request in commit["associatedPullRequests"]["nodes"]: - if ( - pull_request["baseRepository"]["nameWithOwner"] - == "{}/{}".format(self._owner, self._name) - and pull_request["baseRefName"] == self.default_branch - and pull_request["mergeCommit"]["oid"] == commit["oid"] - ): - pull_requests.append(pull_request) - - return pull_requests - - def create_pull_request( - self, source, target, title, description="", draft=False, can_modify=True - ): - _QUERY = """ - createPullRequest(input: {{ - baseRefName: "{target}", - headRefName: "{source}", - repositoryId: "{id}", - title: "{title}", - body: "{body}", - draft: {draft}, - maintainerCanModify: {modify} - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format( - target=target, - source=source, - id=self._id, - title=title, - body=description, - draft="true" if draft else "false", - modify="true" if can_modify else "false", - pull_request_data=self._PULL_REQUEST, - ) - return self._run(query, is_mutation=True)["createPullRequest"]["pullRequest"] - - def merge_pull_request(self, id): - _QUERY = """ - mergePullRequest(input: {{ - pullRequestId: "{id}" - }}) {{ - pullRequest {{ - {pull_request_data} - }} - }} - """ - - query = _QUERY.format(id=id, pull_request_data=self._PULL_REQUEST) - return self._run(query, is_mutation=True)["mergePullRequest"]["pullRequest"] - - # FIXME: figure out how to add more assignees at once - def add_assignee(self, pr, assignee): - _QUERY = """ - addAssigneesToAssignable(input: {{ - assignableId: "{id1}", - assigneeIds: "{id2}" - }}) {{ - clientMutationId - }} - """ - - query = _QUERY.format(id1=pr["id"], id2=assignee["id"]) - self._run(query, is_mutation=True) - - def set_label(self, pull_request, label_name): - """ - Set label by name to the pull request - - Args: - pull_request: JSON object returned by `get_pull_requests()` - label_name (string): label name - """ - - _GET_LABEL = """ - repository(owner: "{owner}" name: "{name}") {{ - labels(first: {max_page_size} {next} query: "{label_name}") {{ - pageInfo {{ - hasNextPage - 
endCursor - }} - nodes {{ - id - name - color - }} - }} - }} - """ - - _SET_LABEL = """ - addLabelsToLabelable(input: {{ - labelableId: "{pr_id}", - labelIds: "{label_id}" - }}) {{ - clientMutationId - }} - """ - - labels = [] - not_end = True - query = _GET_LABEL.format( - owner=self._owner, - name=self._name, - label_name=label_name, - max_page_size=self._max_page_size, - next="", - ) - - while not_end: - result = self._run(query)["repository"]["labels"] - not_end = result["pageInfo"]["hasNextPage"] - query = _GET_LABEL.format( - owner=self._owner, - name=self._name, - label_name=label_name, - max_page_size=self._max_page_size, - next='after: "{}"'.format(result["pageInfo"]["endCursor"]), - ) - - labels += [label for label in result["nodes"]] - - if not labels: - return - - query = _SET_LABEL.format(pr_id=pull_request["id"], label_id=labels[0]["id"]) - self._run(query, is_mutation=True) - - def _run(self, query, is_mutation=False): - from requests.adapters import HTTPAdapter - from urllib3.util.retry import Retry - - def requests_retry_session( - retries=3, - backoff_factor=0.3, - status_forcelist=(500, 502, 504), - session=None, - ): - session = session or requests.Session() - retry = Retry( - total=retries, - read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session - - headers = {"Authorization": "bearer {}".format(self._token)} - if is_mutation: - query = """ - mutation {{ - {query} - }} - """.format( - query=query - ) - else: - query = """ - query {{ - {query} - rateLimit {{ - cost - remaining - }} - }} - """.format( - query=query - ) - - while True: - request = requests_retry_session().post( - "https://api.github.com/graphql", json={"query": query}, headers=headers - ) - if request.status_code == 200: - result = request.json() - if "errors" in result: - raise Exception( - "Errors occurred: {}\nOriginal query: {}".format( - result["errors"], query - ) - ) - - if not is_mutation: - import inspect - - caller = inspect.getouterframes(inspect.currentframe(), 2)[1][3] - if caller not in list(self.api_costs.keys()): - self.api_costs[caller] = 0 - self.api_costs[caller] += result["data"]["rateLimit"]["cost"] - - return result["data"] - else: - import json - - raise Exception( - "Query failed with code {code}:\n{json}".format( - code=request.status_code, - json=json.dumps(request.json(), indent=4), - ) - ) From bf591b971a6d9dbd1b63357ac353066d6348fc34 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 15:53:08 +0200 Subject: [PATCH 33/58] Moar categories --- tests/ci/run_check.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 74e60582e8d..608a708b04b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -89,6 +89,10 @@ MAP_CATEGORY_TO_LABEL = { "Performance Improvement": "pr-performance", "Backward Incompatible Change": "pr-backward-incompatible", "Build/Testing/Packaging Improvement": "pr-build", + "Build Improvement": "pr-build", + "Build/Testing Improvement": "pr-build", + "Build": "pr-build", + "Packaging Improvement": "pr-build", "Not for changelog (changelog entry is not required)": "pr-not-for-changelog", "Not for changelog": "pr-not-for-changelog", "Documentation (changelog entry is not required)": "pr-documentation", From 693b47cdc9c39ee4d51202592cabc915ed9b0962 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Tue, 29 Mar 
2022 17:51:56 +0300 Subject: [PATCH 34/58] Revert "Fix enable LLVM for JIT compilation in CMake" --- contrib/llvm-cmake/CMakeLists.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/contrib/llvm-cmake/CMakeLists.txt b/contrib/llvm-cmake/CMakeLists.txt index 87c8a65510f..6ff07f0e016 100644 --- a/contrib/llvm-cmake/CMakeLists.txt +++ b/contrib/llvm-cmake/CMakeLists.txt @@ -1,9 +1,12 @@ -if (APPLE OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined") - set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) +# During cross-compilation in our CI we have to use llvm-tblgen and other building tools +# tools to be build for host architecture and everything else for target architecture (e.g. AArch64) +# Possible workaround is to use llvm-tblgen from some package... +# But lets just enable LLVM for native builds +if (CMAKE_CROSSCOMPILING OR SANITIZE STREQUAL "undefined") + set (ENABLE_EMBEDDED_COMPILER_DEFAULT OFF) else() - set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) + set (ENABLE_EMBEDDED_COMPILER_DEFAULT ON) endif() - option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ${ENABLE_EMBEDDED_COMPILER_DEFAULT}) if (NOT ENABLE_EMBEDDED_COMPILER) From 27390799c186c0f2b69d882a983d4f36bf9945a5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 18:23:18 +0200 Subject: [PATCH 35/58] Fix black --- tests/ci/docker_images_check.py | 1 + tests/integration/test_allowed_url_from_config/test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index ef6d369bf4f..4277ef1e15f 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -465,5 +465,6 @@ def main(): if status == "error": sys.exit(1) + if __name__ == "__main__": main() diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From d59941e4f6ccd623b1f938460e6a712dd69a73d0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:15:25 +0200 Subject: [PATCH 36/58] Style --- tests/ci/commit_status_helper.py | 1 + tests/integration/test_allowed_url_from_config/test.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 53dfb145b44..4c79d5ace07 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -60,6 +60,7 @@ def post_commit_status_to_file(file_path, description, state, report_url): out = csv.writer(f, delimiter="\t") out.writerow([state, report_url, description]) + def post_label(gh, pr_info, label_name): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 13f3929902d..01a2a500ebf 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -280,4 +280,4 @@ def test_HDFS(start_cluster): def 
test_schema_inference(start_cluster): error = node7.query_and_get_error("desc url('http://test.com`, 'TSVRaw'')") - assert(error.find('ReadWriteBufferFromHTTPBase') == -1) + assert error.find("ReadWriteBufferFromHTTPBase") == -1 From 9220bedb7dc57a1f0ee2be1999610fad75ea674a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:28:18 +0200 Subject: [PATCH 37/58] Add submodule changed --- tests/ci/commit_status_helper.py | 5 +++-- tests/ci/pr_info.py | 9 +++++++++ tests/ci/run_check.py | 12 +++++++++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 4c79d5ace07..b4a3e49372f 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -61,7 +61,8 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) -def post_label(gh, pr_info, label_name): +def post_labels(gh, pr_info, labels_names): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) - pull_request.add_to_labels(label_name) + for label in labels_names: + pull_request.add_to_labels(label) diff --git a/tests/ci/pr_info.py b/tests/ci/pr_info.py index ee4399792ae..0de4ec89124 100644 --- a/tests/ci/pr_info.py +++ b/tests/ci/pr_info.py @@ -236,6 +236,15 @@ class PRInfo: return True return False + def has_changes_in_submodules(self): + if self.changed_files is None or not self.changed_files: + return True + + for f in self.changed_files: + if "contrib" in f: + return True + return False + def can_skip_builds_and_use_version_from_master(self): # TODO: See a broken loop if "force tests" in self.labels: diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 608a708b04b..f0100082c71 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit, post_label +from commit_status_helper import get_commit, post_labels NAME = "Run Check (actions)" @@ -22,6 +22,7 @@ OK_SKIP_LABELS = {"release", "pr-backport", "pr-cherrypick"} CAN_BE_TESTED_LABEL = "can be tested" DO_NOT_TEST_LABEL = "do not test" FORCE_TESTS_LABEL = "force tests" +SUBMODULE_CHANGED_LABEL = "submodule changed" # Individual trusted contirbutors who are not in any trusted organization. 
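As a side note on the hunk above: has_changes_in_submodules is a pure path heuristic. An unknown or empty change set is treated conservatively as "submodules may have changed", and otherwise any changed path that contains "contrib" (the directory where third-party submodules such as contrib/NuRaft are checked out) flags the PR for the "submodule changed" label. A standalone sketch of the same check; the sample change sets are made up for illustration:

    def has_changes_in_submodules(changed_files):
        # Unknown or empty change set: assume submodules may have been touched.
        if not changed_files:
            return True
        # Third-party submodules live under contrib/, so any path containing
        # "contrib" counts as a submodule change.
        return any("contrib" in path for path in changed_files)

    # Only the second change set would get the "submodule changed" label.
    print(has_changes_in_submodules(["tests/ci/run_check.py"]))             # False
    print(has_changes_in_submodules(["contrib/NuRaft", "docs/README.md"]))  # True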
# Can be changed in runtime: we will append users that we learned to be in @@ -227,15 +228,20 @@ def check_pr_description(pr_info): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - pr_info = PRInfo(need_orgs=True, pr_event_from_api=True) + pr_info = PRInfo(need_orgs=True, pr_event_from_api=True, need_changed_files=True) can_run, description, labels_state = should_run_checks_for_pr(pr_info) gh = Github(get_best_robot_token()) commit = get_commit(gh, pr_info.sha) description_report, category = check_pr_description(pr_info) + pr_labels = [] if category in MAP_CATEGORY_TO_LABEL: - post_label(gh, pr_info, MAP_CATEGORY_TO_LABEL[category]) + pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) + if pr_info.has_changes_in_submodules(): + pr_labels.append(SUBMODULE_CHANGED_LABEL) + + post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From 999558f00fa01605c42eacd2f85243cf74291e4c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:28:45 +0200 Subject: [PATCH 38/58] Try fake update submodule --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 1707a7572aa..7c6c6961347 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d +Subproject commit 7c6c696134706eeef274787c9282e40416dc03e8 From 1a8b2c9637c53bef291c0ab33866f84ef437f2e4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:50:06 +0200 Subject: [PATCH 39/58] Remove submodule changed --- tests/ci/commit_status_helper.py | 7 +++++++ tests/ci/run_check.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index b4a3e49372f..a53ce6715d5 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -61,6 +61,13 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) +def remove_labels(gh, pr_info, labels_names): + repo = gh.get_repo(GITHUB_REPOSITORY) + pull_request = repo.get_pull(pr_info.number) + for label in labels_names: + pull_request.remove_from_labels(label) + + def post_labels(gh, pr_info, labels_names): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index f0100082c71..0138808c6ae 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -8,7 +8,7 @@ from github import Github from env_helper import GITHUB_RUN_URL, GITHUB_REPOSITORY, GITHUB_SERVER_URL from pr_info import PRInfo from get_robot_token import get_best_robot_token -from commit_status_helper import get_commit, post_labels +from commit_status_helper import get_commit, post_labels, remove_labels NAME = "Run Check (actions)" @@ -235,13 +235,19 @@ if __name__ == "__main__": description_report, category = check_pr_description(pr_info) pr_labels = [] - if category in MAP_CATEGORY_TO_LABEL: + if ( + category in MAP_CATEGORY_TO_LABEL + and MAP_CATEGORY_TO_LABEL[category] not in pr_info.labels + ): pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) if pr_info.has_changes_in_submodules(): pr_labels.append(SUBMODULE_CHANGED_LABEL) + elif SUBMODULE_CHANGED_LABEL in pr_info.labels: + remove_labels(gh, pr_info, [SUBMODULE_CHANGED_LABEL]) - post_labels(gh, pr_info, pr_labels) + if pr_labels: + post_labels(gh, pr_info, pr_labels) if 
description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From 2e4ab305dd2a6951ad045b6cbdb920730710ddff Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 29 Mar 2022 19:50:13 +0200 Subject: [PATCH 40/58] Revert "Try fake update submodule" This reverts commit 999558f00fa01605c42eacd2f85243cf74291e4c. --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 7c6c6961347..1707a7572aa 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 7c6c696134706eeef274787c9282e40416dc03e8 +Subproject commit 1707a7572aa66ec5d0a2dbe2bf5effa3352e6b2d From 74cafe154a4e707d7f21ed7c39aa8c0e04e08927 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Mar 2022 19:58:59 +0200 Subject: [PATCH 41/58] fix flaky test 02246_async_insert_quota --- tests/queries/0_stateless/02246_async_insert_quota.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02246_async_insert_quota.sh b/tests/queries/0_stateless/02246_async_insert_quota.sh index 9fc4df4660c..4da93f94f19 100755 --- a/tests/queries/0_stateless/02246_async_insert_quota.sh +++ b/tests/queries/0_stateless/02246_async_insert_quota.sh @@ -16,7 +16,7 @@ ${CLICKHOUSE_CLIENT} -q "CREATE ROLE r02246" ${CLICKHOUSE_CLIENT} -q "CREATE USER u02246" ${CLICKHOUSE_CLIENT} -q "GRANT INSERT ON async_inserts_02246 TO r02246" ${CLICKHOUSE_CLIENT} -q "GRANT r02246 to u02246" -${CLICKHOUSE_CLIENT} -q "CREATE QUOTA q02246 FOR INTERVAL 1 HOUR MAX QUERY INSERTS = 2 TO r02246" +${CLICKHOUSE_CLIENT} -q "CREATE QUOTA q02246 FOR INTERVAL 100 YEAR MAX QUERY INSERTS = 2 TO r02246" ${CLICKHOUSE_CLIENT} --user u02246 --async_insert 1 -q "INSERT INTO async_inserts_02246 VALUES (1, 'a')" ${CLICKHOUSE_CLIENT} --user u02246 --async_insert 1 -q "INSERT INTO async_inserts_02246 VALUES (2, 'b')" From 476c7f9d7e10e882a085a2b2a08cec43ed951718 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:32:28 +0000 Subject: [PATCH 42/58] Update ci checks server. --- tests/ci/clickhouse_helper.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 7ccbcb4a47e..3731037bf4c 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -10,13 +10,13 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: def __init__(self, url=None): if url is None: - self.url = get_parameter_from_ssm("clickhouse-test-stat-url2") - self.auth = { - "X-ClickHouse-User": get_parameter_from_ssm( - "clickhouse-test-stat-login2" - ), - "X-ClickHouse-Key": "", - } + self.url = get_parameter_from_ssm("clickhouse-test-stat-url") + + self.url = url + self.auth = { + 'X-ClickHouse-User': user if user is not None else get_parameter_from_ssm("clickhouse-test-stat-login"), + 'X-ClickHouse-Key': password if password is not None else get_parameter_from_ssm("clickhouse-test-stat-password") + } @staticmethod def _insert_json_str_info_impl(url, auth, db, table, json_str): From 3995a52da61cb727f9604b5d0114adc500d4bced Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:36:36 +0000 Subject: [PATCH 43/58] Fix check. 
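For context on this and the next few fixes: patch 42 above left ClickHouseHelper.__init__ referring to user and password names that are never defined, and its unconditional self.url = url overwrote the value just fetched from SSM when no URL was passed in. Patches 43 and 44 below repair that, and the later style patches only reformat the same lines. Assembled from those follow-up hunks (a sketch for readability, not an additional patch), the constructor ends up roughly as:

    from get_robot_token import get_parameter_from_ssm


    class ClickHouseHelper:
        def __init__(self, url=None):
            # Fall back to the URL stored in SSM when none is passed explicitly.
            if url is None:
                url = get_parameter_from_ssm("clickhouse-test-stat-url")

            self.url = url
            # Credentials always come from SSM parameters.
            self.auth = {
                "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"),
                "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"),
            }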
--- tests/ci/clickhouse_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 3731037bf4c..ea99b97c7d8 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,7 +8,7 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None): + def __init__(self, url=None, user=None, password=None): if url is None: self.url = get_parameter_from_ssm("clickhouse-test-stat-url") From bb82f77477038fa9623f40c21d81ed9899eff429 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:46:48 +0000 Subject: [PATCH 44/58] Fix check. --- tests/ci/clickhouse_helper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index ea99b97c7d8..093d5541362 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -8,14 +8,14 @@ from get_robot_token import get_parameter_from_ssm class ClickHouseHelper: - def __init__(self, url=None, user=None, password=None): + def __init__(self, url=None): if url is None: - self.url = get_parameter_from_ssm("clickhouse-test-stat-url") + url = get_parameter_from_ssm("clickhouse-test-stat-url") self.url = url self.auth = { - 'X-ClickHouse-User': user if user is not None else get_parameter_from_ssm("clickhouse-test-stat-login"), - 'X-ClickHouse-Key': password if password is not None else get_parameter_from_ssm("clickhouse-test-stat-password") + 'X-ClickHouse-User': get_parameter_from_ssm("clickhouse-test-stat-login"), + 'X-ClickHouse-Key': get_parameter_from_ssm("clickhouse-test-stat-password") } @staticmethod From 081073763748106499d3279894f99705ab332700 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 18:58:16 +0000 Subject: [PATCH 45/58] Fix style --- tests/ci/clickhouse_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 093d5541362..6627a3974d1 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -14,8 +14,8 @@ class ClickHouseHelper: self.url = url self.auth = { - 'X-ClickHouse-User': get_parameter_from_ssm("clickhouse-test-stat-login"), - 'X-ClickHouse-Key': get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") } @staticmethod From 3849e63ab186d86a29ed0c77e1524800ff87bafd Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 29 Mar 2022 19:06:50 +0000 Subject: [PATCH 46/58] Change database --- docker/test/performance-comparison/compare.sh | 2 +- tests/ci/clickhouse_helper.py | 2 +- tests/ci/compatibility_check.py | 2 +- tests/ci/docker_images_check.py | 2 +- tests/ci/docker_manifests_merge.py | 2 +- tests/ci/docs_check.py | 2 +- tests/ci/fast_test_check.py | 2 +- tests/ci/functional_test_check.py | 2 +- tests/ci/integration_test_check.py | 2 +- tests/ci/keeper_jepsen_check.py | 2 +- tests/ci/split_build_smoke_check.py | 2 +- tests/ci/stress_check.py | 2 +- tests/ci/style_check.py | 2 +- tests/ci/unit_tests_check.py | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index 54f71ce05bb..cdfa080a475 100755 --- a/docker/test/performance-comparison/compare.sh +++ 
b/docker/test/performance-comparison/compare.sh @@ -1378,7 +1378,7 @@ $REF_SHA $SHA_TO_TEST $(numactl --hardware | sed -n 's/^available:[[:space:]]\+/ EOF # Also insert some data about the check into the CI checks table. - "${client[@]}" --query "INSERT INTO "'"'"gh-data"'"'".checks FORMAT TSVWithNamesAndTypes" \ + "${client[@]}" --query "INSERT INTO "'"'"default"'"'".checks FORMAT TSVWithNamesAndTypes" \ < ci-checks.tsv set -x diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 6627a3974d1..218aaca8b91 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -179,7 +179,7 @@ def mark_flaky_tests(clickhouse_helper, check_name, test_results): check_name=check_name ) - tests_data = clickhouse_helper.select_json_each_row("gh-data", query) + tests_data = clickhouse_helper.select_json_each_row("default", query) master_failed_tests = {row["test_name"] for row in tests_data} logging.info("Found flaky tests: %s", ", ".join(master_failed_tests)) diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index d546fabf231..a6846257cce 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -197,4 +197,4 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py index 3d0cc468aec..44c4623b469 100644 --- a/tests/ci/docker_images_check.py +++ b/tests/ci/docker_images_check.py @@ -459,7 +459,7 @@ def main(): NAME, ) ch_helper = ClickHouseHelper() - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if __name__ == "__main__": diff --git a/tests/ci/docker_manifests_merge.py b/tests/ci/docker_manifests_merge.py index 8bd50819877..9371440346e 100644 --- a/tests/ci/docker_manifests_merge.py +++ b/tests/ci/docker_manifests_merge.py @@ -234,7 +234,7 @@ def main(): NAME, ) ch_helper = ClickHouseHelper() - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if __name__ == "__main__": diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 58678b160a4..a7de66859e2 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -114,4 +114,4 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/fast_test_check.py b/tests/ci/fast_test_check.py index 64e04594786..8c17c15b463 100644 --- a/tests/ci/fast_test_check.py +++ b/tests/ci/fast_test_check.py @@ -204,7 +204,7 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) # Refuse other checks to run if fast test failed if state != "success": diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index 52ec5a0f8e9..c5a44ba66d2 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -356,7 +356,7 @@ if __name__ == "__main__": report_url, check_name_with_group, ) - 
ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if state != "success": if "force-tests" in pr_info.labels: diff --git a/tests/ci/integration_test_check.py b/tests/ci/integration_test_check.py index 30009414d6e..636fbe664ce 100644 --- a/tests/ci/integration_test_check.py +++ b/tests/ci/integration_test_check.py @@ -279,4 +279,4 @@ if __name__ == "__main__": report_url, check_name_with_group, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/keeper_jepsen_check.py b/tests/ci/keeper_jepsen_check.py index 24d720e67ab..14c31927b75 100644 --- a/tests/ci/keeper_jepsen_check.py +++ b/tests/ci/keeper_jepsen_check.py @@ -271,5 +271,5 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) clear_autoscaling_group() diff --git a/tests/ci/split_build_smoke_check.py b/tests/ci/split_build_smoke_check.py index 41ba6c2fedb..f281bdc55a8 100644 --- a/tests/ci/split_build_smoke_check.py +++ b/tests/ci/split_build_smoke_check.py @@ -147,4 +147,4 @@ if __name__ == "__main__": report_url, CHECK_NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index 32c181140e2..4ecdd249351 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -176,4 +176,4 @@ if __name__ == "__main__": report_url, check_name, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/style_check.py b/tests/ci/style_check.py index 1b3037217c8..4c210f9beab 100644 --- a/tests/ci/style_check.py +++ b/tests/ci/style_check.py @@ -117,4 +117,4 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) diff --git a/tests/ci/unit_tests_check.py b/tests/ci/unit_tests_check.py index 84c4faa822d..6db160d9a54 100644 --- a/tests/ci/unit_tests_check.py +++ b/tests/ci/unit_tests_check.py @@ -173,4 +173,4 @@ if __name__ == "__main__": report_url, check_name, ) - ch_helper.insert_events_into(db="gh-data", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) From 7d140c9a59c1760d7f89d6b1cbc3ef149f3324e5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 29 Mar 2022 22:30:50 +0300 Subject: [PATCH 47/58] clickhouse-test: increase timeout for obtaining processlist Sometimes 30 seconds is not enough on CI [1]: 2022.03.29 04:46:09.805406 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} executeQuery: (from [::1]:33506) SELECT * FROM system.processes WHERE query NOT LIKE '%system.processes%' AND Settings['log_comment'] = '02016_order_by_with_fill_monotonic_functions_removal.sql' AND current_database = 'test_7fqp6w' 2022.03.29 04:46:09.866100 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} ContextAccess (default): Access granted: SELECT(is_initial_query, user, query_id, 
address, port, initial_user, initial_query_id, initial_address, initial_port, interface, os_user, client_hostname, client_name, client_revision, client_version_major, client_version_minor, client_version_patch, http_method, http_user_agent, http_referer, forwarded_for, quota_key, distributed_depth, elapsed, is_cancelled, read_rows, read_bytes, total_rows_approx, written_rows, written_bytes, memory_usage, peak_memory_usage, query, thread_ids, ProfileEvents, Settings, current_database) ON system.processes 2022.03.29 04:46:12.787395 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} InterpreterSelectQuery: FetchColumns -> Complete 2022.03.29 04:46:19.749163 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} ParallelFormattingOutputFormat: Parallel formatting is being used 2022.03.29 04:46:37.923282 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} executeQuery: Read 15 rows, 44.20 KiB in 28.117172383 sec., 0 rows/sec., 1.57 KiB/sec. 2022.03.29 04:46:40.020586 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} DynamicQueryHandler: Done processing query 2022.03.29 04:46:40.033535 [ 3885 ] {ae06ad04-f9fa-44d4-adeb-9e91aaf9bfa9} MemoryTracker: Peak memory usage (for query): 4.00 MiB. [1]: https://s3.amazonaws.com/clickhouse-test-reports/32928/ddd5bebe555ce8feebcdd339e47fc45184c20dd1/stateless_tests__thread__actions__[1/3].html Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 863e061085a..5f9ab01cdcc 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -116,7 +116,7 @@ def clickhouse_execute_http(base_args, query, timeout=30, settings=None, default def clickhouse_execute(base_args, query, timeout=30, settings=None): return clickhouse_execute_http(base_args, query, timeout, settings).strip() -def clickhouse_execute_json(base_args, query, timeout=30, settings=None): +def clickhouse_execute_json(base_args, query, timeout=60, settings=None): data = clickhouse_execute_http(base_args, query, timeout, settings, 'JSONEachRow') if not data: return None From b264bdcf5e65874e41457ff94d8d6fe853560c2a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 20 Mar 2022 21:40:58 +0300 Subject: [PATCH 48/58] tests: wait for left queries in 00417_kill_query CI: https://s3.amazonaws.com/clickhouse-test-reports/0/c244ee7cbb61fea384679e18a577ff579060288b/stateless_tests__release__s3_storage__actions_/runlog.log Signed-off-by: Azat Khuzhin --- tests/queries/0_stateless/00417_kill_query.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/00417_kill_query.sh b/tests/queries/0_stateless/00417_kill_query.sh index 28f03fc9c37..e40eb5c3f80 100755 --- a/tests/queries/0_stateless/00417_kill_query.sh +++ b/tests/queries/0_stateless/00417_kill_query.sh @@ -21,3 +21,5 @@ $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 0 ASYNC" $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 0 FORMAT TabSeparated" $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 0 SYNC FORMAT TabSeparated" $CLICKHOUSE_CLIENT -q "KILL QUERY WHERE 1 TEST" &>/dev/null + +clickhouse_test_wait_queries 60 From 2a099d90fe5dcdd788123ab9e2e604d2dcb9d7ba Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 30 Mar 2022 10:45:13 +0300 Subject: [PATCH 49/58] tests: make 02152_http_external_tables_memory_tracking more stable After settings randomization, it can not reach max_untracked_memory, i.e. due to max_block_size=67990 for example (like in [1]). 
[1]: https://s3.amazonaws.com/clickhouse-test-reports/35683/c609426d091ea8b9f2d37f8d7fcbf9e759cc94e5/stateless_tests__release__actions_.html Fix this, by settings max_untracked_memory to 0, to track any allocations. Signed-off-by: Azat Khuzhin --- .../0_stateless/02152_http_external_tables_memory_tracking.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh index 44de0e15370..4132ac91ae4 100755 --- a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh @@ -23,7 +23,7 @@ function run_and_check() echo "Checking $*" # Run query with external table (implicit StorageMemory user) - $CLICKHOUSE_CURL -sS -F "s=@$tmp_file;" "$CLICKHOUSE_URL&s_structure=key+Int&query=SELECT+count()+FROM+s&memory_profiler_sample_probability=1&query_id=$query_id&$*" -o /dev/null + $CLICKHOUSE_CURL -sS -F "s=@$tmp_file;" "$CLICKHOUSE_URL&s_structure=key+Int&query=SELECT+count()+FROM+s&memory_profiler_sample_probability=1&max_untracked_memory=0&query_id=$query_id&$*" -o /dev/null ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- <<<'SYSTEM FLUSH LOGS' From 68ec0d92c0c5446d7a32156ae2940e78bedb582f Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 10:58:34 +0200 Subject: [PATCH 50/58] Remove if category changed --- tests/ci/run_check.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 0138808c6ae..5f3792f3432 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -234,20 +234,29 @@ if __name__ == "__main__": commit = get_commit(gh, pr_info.sha) description_report, category = check_pr_description(pr_info) - pr_labels = [] + pr_labels_to_add = [] + pr_labels_to_remove = [] if ( category in MAP_CATEGORY_TO_LABEL and MAP_CATEGORY_TO_LABEL[category] not in pr_info.labels ): - pr_labels.append(MAP_CATEGORY_TO_LABEL[category]) + pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) + + for label in pr_info.labels: + if label in MAP_CATEGORY_TO_LABEL and label not in pr_labels_to_add: + pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): - pr_labels.append(SUBMODULE_CHANGED_LABEL) + pr_labels_to_add.append(SUBMODULE_CHANGED_LABEL) elif SUBMODULE_CHANGED_LABEL in pr_info.labels: - remove_labels(gh, pr_info, [SUBMODULE_CHANGED_LABEL]) + pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL) + + if pr_labels_to_add: + post_labels(gh, pr_info, pr_labels_to_add) + + if pr_labels_to_remove: + remove_labels(gh, pr_info, pr_labels_to_remove) - if pr_labels: - post_labels(gh, pr_info, pr_labels) if description_report: print("::notice ::Cannot run, description does not match the template") logging.info( From bcf64a73d1c272dca22a1ca910df3153a6a66784 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:00:45 +0200 Subject: [PATCH 51/58] Followup --- tests/ci/run_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 5f3792f3432..8f28287e88b 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -243,7 +243,7 @@ if __name__ == "__main__": pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) for label in pr_info.labels: - if label in MAP_CATEGORY_TO_LABEL and label not in pr_labels_to_add: + if label in MAP_CATEGORY_TO_LABEL.values() and label 
not in pr_labels_to_add: pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): From 2a1b331c0de65c680285e777e9a90ad49f1d86f2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:15:54 +0200 Subject: [PATCH 52/58] Fix build check --- tests/ci/build_report_check.py | 6 +++++- tests/ci/docs_check.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 50315ad3d78..4e0859ef865 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -233,7 +233,11 @@ if __name__ == "__main__": if ok_builds == 0 or some_builds_are_missing: summary_status = "error" - description = f"{ok_builds}/{total_builds} builds are OK" + addition = "" + if some_builds_are_missing: + addition = "(some builds are missing)" + + description = f"{ok_builds}/{total_builds} builds are OK {addition}" print("::notice ::Report url: {}".format(url)) diff --git a/tests/ci/docs_check.py b/tests/ci/docs_check.py index 6dde0dfaea4..803fb3cff7d 100644 --- a/tests/ci/docs_check.py +++ b/tests/ci/docs_check.py @@ -114,7 +114,7 @@ if __name__ == "__main__": report_url, NAME, ) - ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) + ch_helper.insert_events_into(db="default", table="checks", events=prepared_events) if status == "error": sys.exit(1) From c510ede8dd5b9e739be4de6b6c57aa8bbddfa22b Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:19:11 +0200 Subject: [PATCH 53/58] Fixup --- tests/ci/run_check.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 8f28287e88b..93dc77124c2 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -243,7 +243,11 @@ if __name__ == "__main__": pr_labels_to_add.append(MAP_CATEGORY_TO_LABEL[category]) for label in pr_info.labels: - if label in MAP_CATEGORY_TO_LABEL.values() and label not in pr_labels_to_add: + if ( + label in MAP_CATEGORY_TO_LABEL.values() + and category in MAP_CATEGORY_TO_LABEL + and label != MAP_CATEGORY_TO_LABEL[category] + ): pr_labels_to_remove.append(label) if pr_info.has_changes_in_submodules(): From b19ecf60ad9bd518de109dcd536b45ac716faaaf Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 11:30:21 +0200 Subject: [PATCH 54/58] Fix style --- tests/ci/clickhouse_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 218aaca8b91..d52b6262a78 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -15,7 +15,7 @@ class ClickHouseHelper: self.url = url self.auth = { "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), } @staticmethod From d3697625a93a32f34b6955e1763e081909f71584 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 30 Mar 2022 10:55:49 +0000 Subject: [PATCH 55/58] Fix stylecheck --- tests/ci/clickhouse_helper.py | 2 +- utils/tests-visualizer/index.html | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 218aaca8b91..d52b6262a78 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -15,7 +15,7 @@ class ClickHouseHelper: self.url = url self.auth = { "X-ClickHouse-User": 
get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), } @staticmethod diff --git a/utils/tests-visualizer/index.html b/utils/tests-visualizer/index.html index 13f8daaa151..00076f683fa 100644 --- a/utils/tests-visualizer/index.html +++ b/utils/tests-visualizer/index.html @@ -84,14 +84,14 @@ let render_data_query = ` SELECT groupArray([d, n, fail]) FROM ( SELECT n, check_start_time::Date - start_date AS d, max(test_status LIKE 'F%' OR test_status LIKE 'E%') AS fail - FROM "gh-data".checks + FROM "default".checks INNER JOIN ( SELECT test_name, toUInt16(rowNumberInAllBlocks()) AS n FROM ( SELECT DISTINCT test_name - FROM "gh-data".checks + FROM "default".checks WHERE match(test_name, '^\\d+_') AND check_name ILIKE '%stateless%' AND check_start_time > now() - INTERVAL 1 DAY ORDER BY test_name ) @@ -112,7 +112,7 @@ let test_names_query = ` SELECT test_name, toUInt16(rowNumberInAllBlocks()) AS n FROM ( SELECT DISTINCT test_name - FROM "gh-data".checks + FROM "default".checks WHERE match(test_name, '^\\d+_') AND check_name ILIKE '%stateless%' AND check_start_time > now() - INTERVAL 1 DAY ORDER BY test_name ) FORMAT JSONCompact`; From 8980994faacbc806a9ed1bd3a76c7321132fd270 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 16:29:13 +0200 Subject: [PATCH 56/58] Resurrect build hash --- tests/ci/build_check.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 24af9e5194c..eb2aa616383 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -54,6 +54,7 @@ def get_packager_cmd( build_version: str, image_version: str, ccache_path: str, + official: bool, ) -> str: package_type = build_config["package_type"] comp = build_config["compiler"] @@ -83,6 +84,9 @@ def get_packager_cmd( if _can_export_binaries(build_config): cmd += " --with-binaries=tests" + if official: + cmd += " --official" + return cmd @@ -254,9 +258,11 @@ def main(): logging.info("Got version from repo %s", version.string) + official_flag = pr_info.number == 0 version_type = "testing" if "release" in pr_info.labels or "release-lts" in pr_info.labels: version_type = "stable" + official_flag = True update_version_local(REPO_COPY, version, version_type) @@ -290,7 +296,9 @@ def main(): version.string, image_version, ccache_path, + official=official_flag ) + logging.info("Going to run packager with %s", packager_cmd) build_clickhouse_log = os.path.join(TEMP_PATH, "build_log") From c34d5125864a1e33ab846bcf41e986357dc01a3a Mon Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 30 Mar 2022 18:36:37 +0300 Subject: [PATCH 57/58] Update 00484_preferred_max_column_in_block_size_bytes.sql --- .../00484_preferred_max_column_in_block_size_bytes.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql b/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql index dc021ad52db..470bca70e06 100644 --- a/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql +++ b/tests/queries/0_stateless/00484_preferred_max_column_in_block_size_bytes.sql @@ -1,3 +1,5 @@ +-- Tags: no-random-settings + drop table if exists tab_00484; create table tab_00484 (date Date, x UInt64, s FixedString(128)) engine = MergeTree PARTITION BY date ORDER BY (date, x) SETTINGS min_bytes_for_wide_part = 0; insert into tab_00484 select 
today(), number, toFixedString('', 128) from system.numbers limit 8192; From fcd4e0be654159e34f61d7c4f95f123ba9b6c67c Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 30 Mar 2022 19:15:27 +0200 Subject: [PATCH 58/58] Black --- tests/ci/build_check.py | 2 +- tests/ci/clickhouse_helper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index eb2aa616383..2a079a60367 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -296,7 +296,7 @@ def main(): version.string, image_version, ccache_path, - official=official_flag + official=official_flag, ) logging.info("Going to run packager with %s", packager_cmd) diff --git a/tests/ci/clickhouse_helper.py b/tests/ci/clickhouse_helper.py index 218aaca8b91..d52b6262a78 100644 --- a/tests/ci/clickhouse_helper.py +++ b/tests/ci/clickhouse_helper.py @@ -15,7 +15,7 @@ class ClickHouseHelper: self.url = url self.auth = { "X-ClickHouse-User": get_parameter_from_ssm("clickhouse-test-stat-login"), - "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password") + "X-ClickHouse-Key": get_parameter_from_ssm("clickhouse-test-stat-password"), } @staticmethod